Branch data Line data Source code
1 : : /*
2 : : * Copyright © 2014 Intel Corporation
3 : : *
4 : : * Permission is hereby granted, free of charge, to any person obtaining a
5 : : * copy of this software and associated documentation files (the "Software"),
6 : : * to deal in the Software without restriction, including without limitation
7 : : * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 : : * and/or sell copies of the Software, and to permit persons to whom the
9 : : * Software is furnished to do so, subject to the following conditions:
10 : : *
11 : : * The above copyright notice and this permission notice (including the next
12 : : * paragraph) shall be included in all copies or substantial portions of the
13 : : * Software.
14 : : *
15 : : * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 : : * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 : : * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 : : * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 : : * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 : : * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 : : * IN THE SOFTWARE.
22 : : *
23 : : * Authors:
24 : : * Ben Widawsky <ben@bwidawsk.net>
25 : : * Michel Thierry <michel.thierry@intel.com>
26 : : * Thomas Daniel <thomas.daniel@intel.com>
27 : : * Oscar Mateo <oscar.mateo@intel.com>
28 : : *
29 : : */
30 : :
31 : : /**
32 : : * DOC: Logical Rings, Logical Ring Contexts and Execlists
33 : : *
34 : : * Motivation:
35 : : * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
36 : : * These expanded contexts enable a number of new abilities, especially
37 : : * "Execlists" (also implemented in this file).
38 : : *
39 : : * One of the main differences with the legacy HW contexts is that logical
40 : : * ring contexts incorporate many more things to the context's state, like
41 : : * PDPs or ringbuffer control registers:
42 : : *
43 : : * The reason why PDPs are included in the context is straightforward: as
44 : : * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
45 : : * contained there mean you don't need to do a ppgtt->switch_mm yourself,
46 : : * instead, the GPU will do it for you on the context switch.
47 : : *
48 : : * But, what about the ringbuffer control registers (head, tail, etc..)?
49 : : * shouldn't we just need a set of those per engine command streamer? This is
50 : : * where the name "Logical Rings" starts to make sense: by virtualizing the
51 : : * rings, the engine cs shifts to a new "ring buffer" with every context
52 : : * switch. When you want to submit a workload to the GPU you: A) choose your
53 : : * context, B) find its appropriate virtualized ring, C) write commands to it
54 : : * and then, finally, D) tell the GPU to switch to that context.
55 : : *
56 : : * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
57 : : * to a contexts is via a context execution list, ergo "Execlists".
58 : : *
59 : : * LRC implementation:
60 : : * Regarding the creation of contexts, we have:
61 : : *
62 : : * - One global default context.
63 : : * - One local default context for each opened fd.
64 : : * - One local extra context for each context create ioctl call.
65 : : *
66 : : * Now that ringbuffers belong per-context (and not per-engine, like before)
67 : : * and that contexts are uniquely tied to a given engine (and not reusable,
68 : : * like before) we need:
69 : : *
70 : : * - One ringbuffer per-engine inside each context.
71 : : * - One backing object per-engine inside each context.
72 : : *
73 : : * The global default context starts its life with these new objects fully
74 : : * allocated and populated. The local default context for each opened fd is
75 : : * more complex, because we don't know at creation time which engine is going
76 : : * to use them. To handle this, we have implemented a deferred creation of LR
77 : : * contexts:
78 : : *
79 : : * The local context starts its life as a hollow or blank holder, that only
80 : : * gets populated for a given engine once we receive an execbuffer. If later
81 : : * on we receive another execbuffer ioctl for the same context but a different
82 : : * engine, we allocate/populate a new ringbuffer and context backing object and
83 : : * so on.
84 : : *
85 : : * Finally, regarding local contexts created using the ioctl call: as they are
86 : : * only allowed with the render ring, we can allocate & populate them right
87 : : * away (no need to defer anything, at least for now).
88 : : *
89 : : * Execlists implementation:
90 : : * Execlists are the new method by which, on gen8+ hardware, workloads are
91 : : * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
92 : : * This method works as follows:
93 : : *
94 : : * When a request is committed, its commands (the BB start and any leading or
95 : : * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
96 : : * for the appropriate context. The tail pointer in the hardware context is not
97 : : * updated at this time, but instead, kept by the driver in the ringbuffer
98 : : * structure. A structure representing this request is added to a request queue
99 : : * for the appropriate engine: this structure contains a copy of the context's
100 : : * tail after the request was written to the ring buffer and a pointer to the
101 : : * context itself.
102 : : *
103 : : * If the engine's request queue was empty before the request was added, the
104 : : * queue is processed immediately. Otherwise the queue will be processed during
105 : : * a context switch interrupt. In any case, elements on the queue will get sent
106 : : * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
107 : : * globally unique 20-bits submission ID.
108 : : *
109 : : * When execution of a request completes, the GPU updates the context status
110 : : * buffer with a context complete event and generates a context switch interrupt.
111 : : * During the interrupt handling, the driver examines the events in the buffer:
112 : : * for each context complete event, if the announced ID matches that on the head
113 : : * of the request queue, then that request is retired and removed from the queue.
114 : : *
115 : : * After processing, if any requests were retired and the queue is not empty
116 : : * then a new execution list can be submitted. The two requests at the front of
117 : : * the queue are next to be submitted but since a context may not occur twice in
118 : : * an execution list, if subsequent requests have the same ID as the first then
119 : : * the two requests must be combined. This is done simply by discarding requests
120 : : * at the head of the queue until either only one requests is left (in which case
121 : : * we use a NULL second context) or the first two requests have unique IDs.
122 : : *
123 : : * By always executing the first two requests in the queue the driver ensures
124 : : * that the GPU is kept as busy as possible. In the case where a single context
125 : : * completes but a second context is still executing, the request for this second
126 : : * context will be at the head of the queue when we remove the first one. This
127 : : * request will then be resubmitted along with a new request for a different context,
128 : : * which will cause the hardware to continue executing the second request and queue
129 : : * the new request (the GPU detects the condition of a context getting preempted
130 : : * with the same context and optimizes the context switch flow by not doing
131 : : * preemption, but just sampling the new tail pointer).
132 : : *
133 : : */
134 : : #include <linux/interrupt.h>
135 : :
136 : : #include "i915_drv.h"
137 : : #include "i915_perf.h"
138 : : #include "i915_trace.h"
139 : : #include "i915_vgpu.h"
140 : : #include "intel_context.h"
141 : : #include "intel_engine_pm.h"
142 : : #include "intel_gt.h"
143 : : #include "intel_gt_pm.h"
144 : : #include "intel_gt_requests.h"
145 : : #include "intel_lrc_reg.h"
146 : : #include "intel_mocs.h"
147 : : #include "intel_reset.h"
148 : : #include "intel_ring.h"
149 : : #include "intel_workarounds.h"
150 : :
151 : : #define RING_EXECLIST_QFULL (1 << 0x2)
152 : : #define RING_EXECLIST1_VALID (1 << 0x3)
153 : : #define RING_EXECLIST0_VALID (1 << 0x4)
154 : : #define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
155 : : #define RING_EXECLIST1_ACTIVE (1 << 0x11)
156 : : #define RING_EXECLIST0_ACTIVE (1 << 0x12)
157 : :
158 : : #define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
159 : : #define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
160 : : #define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
161 : : #define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
162 : : #define GEN8_CTX_STATUS_COMPLETE (1 << 4)
163 : : #define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
164 : :
165 : : #define GEN8_CTX_STATUS_COMPLETED_MASK \
166 : : (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)
167 : :
168 : : #define CTX_DESC_FORCE_RESTORE BIT_ULL(2)
169 : :
170 : : #define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE (0x1) /* lower csb dword */
171 : : #define GEN12_CTX_SWITCH_DETAIL(csb_dw) ((csb_dw) & 0xF) /* upper csb dword */
172 : : #define GEN12_CSB_SW_CTX_ID_MASK GENMASK(25, 15)
173 : : #define GEN12_IDLE_CTX_ID 0x7FF
174 : : #define GEN12_CSB_CTX_VALID(csb_dw) \
175 : : (FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)
176 : :
177 : : /* Typical size of the average request (2 pipecontrols and a MI_BB) */
178 : : #define EXECLISTS_REQUEST_SIZE 64 /* bytes */
179 : : #define WA_TAIL_DWORDS 2
180 : : #define WA_TAIL_BYTES (sizeof(u32) * WA_TAIL_DWORDS)
181 : :
182 : : struct virtual_engine {
183 : : struct intel_engine_cs base;
184 : : struct intel_context context;
185 : :
186 : : /*
187 : : * We allow only a single request through the virtual engine at a time
188 : : * (each request in the timeline waits for the completion fence of
189 : : * the previous before being submitted). By restricting ourselves to
190 : : * only submitting a single request, each request is placed on to a
191 : : * physical to maximise load spreading (by virtue of the late greedy
192 : : * scheduling -- each real engine takes the next available request
193 : : * upon idling).
194 : : */
195 : : struct i915_request *request;
196 : :
197 : : /*
198 : : * We keep a rbtree of available virtual engines inside each physical
199 : : * engine, sorted by priority. Here we preallocate the nodes we need
200 : : * for the virtual engine, indexed by physical_engine->id.
201 : : */
202 : : struct ve_node {
203 : : struct rb_node rb;
204 : : int prio;
205 : : } nodes[I915_NUM_ENGINES];
206 : :
207 : : /*
208 : : * Keep track of bonded pairs -- restrictions upon on our selection
209 : : * of physical engines any particular request may be submitted to.
210 : : * If we receive a submit-fence from a master engine, we will only
211 : : * use one of sibling_mask physical engines.
212 : : */
213 : : struct ve_bond {
214 : : const struct intel_engine_cs *master;
215 : : intel_engine_mask_t sibling_mask;
216 : : } *bonds;
217 : : unsigned int num_bonds;
218 : :
219 : : /* And finally, which physical engines this virtual engine maps onto. */
220 : : unsigned int num_siblings;
221 : : struct intel_engine_cs *siblings[0];
222 : : };
223 : :
224 : 0 : static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
225 : : {
226 : 0 : GEM_BUG_ON(!intel_engine_is_virtual(engine));
227 : 0 : return container_of(engine, struct virtual_engine, base);
228 : : }
229 : :
230 : : static int __execlists_context_alloc(struct intel_context *ce,
231 : : struct intel_engine_cs *engine);
232 : :
233 : : static void execlists_init_reg_state(u32 *reg_state,
234 : : const struct intel_context *ce,
235 : : const struct intel_engine_cs *engine,
236 : : const struct intel_ring *ring,
237 : : bool close);
238 : : static void
239 : : __execlists_update_reg_state(const struct intel_context *ce,
240 : : const struct intel_engine_cs *engine,
241 : : u32 head);
242 : :
243 : 0 : static void mark_eio(struct i915_request *rq)
244 : : {
245 [ # # ]: 0 : if (i915_request_completed(rq))
246 : : return;
247 : :
248 : 0 : GEM_BUG_ON(i915_request_signaled(rq));
249 : :
250 : 0 : dma_fence_set_error(&rq->fence, -EIO);
251 : 0 : i915_request_mark_complete(rq);
252 : : }
253 : :
254 : : static struct i915_request *
255 : 0 : active_request(const struct intel_timeline * const tl, struct i915_request *rq)
256 : : {
257 : 0 : struct i915_request *active = rq;
258 : :
259 : 0 : rcu_read_lock();
260 [ # # ]: 0 : list_for_each_entry_continue_reverse(rq, &tl->requests, link) {
261 [ # # ]: 0 : if (i915_request_completed(rq))
262 : : break;
263 : :
264 : 0 : active = rq;
265 : : }
266 : 0 : rcu_read_unlock();
267 : :
268 : 0 : return active;
269 : : }
270 : :
271 : 0 : static inline u32 intel_hws_preempt_address(struct intel_engine_cs *engine)
272 : : {
273 : 0 : return (i915_ggtt_offset(engine->status_page.vma) +
274 : : I915_GEM_HWS_PREEMPT_ADDR);
275 : : }
276 : :
277 : : static inline void
278 : 0 : ring_set_paused(const struct intel_engine_cs *engine, int state)
279 : : {
280 : : /*
281 : : * We inspect HWS_PREEMPT with a semaphore inside
282 : : * engine->emit_fini_breadcrumb. If the dword is true,
283 : : * the ring is paused as the semaphore will busywait
284 : : * until the dword is false.
285 : : */
286 : 0 : engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
287 : 0 : if (state)
288 : 0 : wmb();
289 : : }
290 : :
291 : 0 : static inline struct i915_priolist *to_priolist(struct rb_node *rb)
292 : : {
293 : 0 : return rb_entry(rb, struct i915_priolist, node);
294 : : }
295 : :
296 : 0 : static inline int rq_prio(const struct i915_request *rq)
297 : : {
298 : 0 : return rq->sched.attr.priority;
299 : : }
300 : :
301 : 0 : static int effective_prio(const struct i915_request *rq)
302 : : {
303 : 0 : int prio = rq_prio(rq);
304 : :
305 : : /*
306 : : * If this request is special and must not be interrupted at any
307 : : * cost, so be it. Note we are only checking the most recent request
308 : : * in the context and so may be masking an earlier vip request. It
309 : : * is hoped that under the conditions where nopreempt is used, this
310 : : * will not matter (i.e. all requests to that context will be
311 : : * nopreempt for as long as desired).
312 : : */
313 [ # # ]: 0 : if (i915_request_has_nopreempt(rq))
314 : 0 : prio = I915_PRIORITY_UNPREEMPTABLE;
315 : :
316 : : /*
317 : : * On unwinding the active request, we give it a priority bump
318 : : * if it has completed waiting on any semaphore. If we know that
319 : : * the request has already started, we can prevent an unwanted
320 : : * preempt-to-idle cycle by taking that into account now.
321 : : */
322 [ # # ]: 0 : if (__i915_request_has_started(rq))
323 : 0 : prio |= I915_PRIORITY_NOSEMAPHORE;
324 : :
325 : : /* Restrict mere WAIT boosts from triggering preemption */
326 : 0 : BUILD_BUG_ON(__NO_PREEMPTION & ~I915_PRIORITY_MASK); /* only internal */
327 : 0 : return prio | __NO_PREEMPTION;
328 : : }
329 : :
330 : 0 : static int queue_prio(const struct intel_engine_execlists *execlists)
331 : : {
332 : 0 : struct i915_priolist *p;
333 : 0 : struct rb_node *rb;
334 : :
335 : 0 : rb = rb_first_cached(&execlists->queue);
336 : 0 : if (!rb)
337 : : return INT_MIN;
338 : :
339 : : /*
340 : : * As the priolist[] are inverted, with the highest priority in [0],
341 : : * we have to flip the index value to become priority.
342 : : */
343 : 0 : p = to_priolist(rb);
344 : 0 : return ((p->priority + 1) << I915_USER_PRIORITY_SHIFT) - ffs(p->used);
345 : : }
346 : :
347 : 0 : static inline bool need_preempt(const struct intel_engine_cs *engine,
348 : : const struct i915_request *rq,
349 : : struct rb_node *rb)
350 : : {
351 : 0 : int last_prio;
352 : :
353 [ # # ]: 0 : if (!intel_engine_has_semaphores(engine))
354 : : return false;
355 : :
356 : : /*
357 : : * Check if the current priority hint merits a preemption attempt.
358 : : *
359 : : * We record the highest value priority we saw during rescheduling
360 : : * prior to this dequeue, therefore we know that if it is strictly
361 : : * less than the current tail of ESLP[0], we do not need to force
362 : : * a preempt-to-idle cycle.
363 : : *
364 : : * However, the priority hint is a mere hint that we may need to
365 : : * preempt. If that hint is stale or we may be trying to preempt
366 : : * ourselves, ignore the request.
367 : : *
368 : : * More naturally we would write
369 : : * prio >= max(0, last);
370 : : * except that we wish to prevent triggering preemption at the same
371 : : * priority level: the task that is running should remain running
372 : : * to preserve FIFO ordering of dependencies.
373 : : */
374 : 0 : last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
375 [ # # ]: 0 : if (engine->execlists.queue_priority_hint <= last_prio)
376 : : return false;
377 : :
378 : : /*
379 : : * Check against the first request in ELSP[1], it will, thanks to the
380 : : * power of PI, be the highest priority of that context.
381 : : */
382 [ # # ]: 0 : if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
383 [ # # ]: 0 : rq_prio(list_next_entry(rq, sched.link)) > last_prio)
384 : : return true;
385 : :
386 [ # # ]: 0 : if (rb) {
387 : 0 : struct virtual_engine *ve =
388 : 0 : rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
389 : 0 : bool preempt = false;
390 : :
391 [ # # ]: 0 : if (engine == ve->siblings[0]) { /* only preempt one sibling */
392 : 0 : struct i915_request *next;
393 : :
394 : 0 : rcu_read_lock();
395 [ # # ]: 0 : next = READ_ONCE(ve->request);
396 [ # # ]: 0 : if (next)
397 : 0 : preempt = rq_prio(next) > last_prio;
398 : 0 : rcu_read_unlock();
399 : : }
400 : :
401 [ # # ]: 0 : if (preempt)
402 : : return preempt;
403 : : }
404 : :
405 : : /*
406 : : * If the inflight context did not trigger the preemption, then maybe
407 : : * it was the set of queued requests? Pick the highest priority in
408 : : * the queue (the first active priolist) and see if it deserves to be
409 : : * running instead of ELSP[0].
410 : : *
411 : : * The highest priority request in the queue can not be either
412 : : * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
413 : : * context, it's priority would not exceed ELSP[0] aka last_prio.
414 : : */
415 [ # # ]: 0 : return queue_prio(&engine->execlists) > last_prio;
416 : : }
417 : :
418 : : __maybe_unused static inline bool
419 : : assert_priority_queue(const struct i915_request *prev,
420 : : const struct i915_request *next)
421 : : {
422 : : /*
423 : : * Without preemption, the prev may refer to the still active element
424 : : * which we refuse to let go.
425 : : *
426 : : * Even with preemption, there are times when we think it is better not
427 : : * to preempt and leave an ostensibly lower priority request in flight.
428 : : */
429 : : if (i915_request_is_active(prev))
430 : : return true;
431 : :
432 : : return rq_prio(prev) >= rq_prio(next);
433 : : }
434 : :
435 : : /*
436 : : * The context descriptor encodes various attributes of a context,
437 : : * including its GTT address and some flags. Because it's fairly
438 : : * expensive to calculate, we'll just do it once and cache the result,
439 : : * which remains valid until the context is unpinned.
440 : : *
441 : : * This is what a descriptor looks like, from LSB to MSB::
442 : : *
443 : : * bits 0-11: flags, GEN8_CTX_* (cached in ctx->desc_template)
444 : : * bits 12-31: LRCA, GTT address of (the HWSP of) this context
445 : : * bits 32-52: ctx ID, a globally unique tag (highest bit used by GuC)
446 : : * bits 53-54: mbz, reserved for use by hardware
447 : : * bits 55-63: group ID, currently unused and set to 0
448 : : *
449 : : * Starting from Gen11, the upper dword of the descriptor has a new format:
450 : : *
451 : : * bits 32-36: reserved
452 : : * bits 37-47: SW context ID
453 : : * bits 48:53: engine instance
454 : : * bit 54: mbz, reserved for use by hardware
455 : : * bits 55-60: SW counter
456 : : * bits 61-63: engine class
457 : : *
458 : : * engine info, SW context ID and SW counter need to form a unique number
459 : : * (Context ID) per lrc.
460 : : */
461 : : static u64
462 : : lrc_descriptor(struct intel_context *ce, struct intel_engine_cs *engine)
463 : : {
464 : : u64 desc;
465 : :
466 : : desc = INTEL_LEGACY_32B_CONTEXT;
467 : : if (i915_vm_is_4lvl(ce->vm))
468 : : desc = INTEL_LEGACY_64B_CONTEXT;
469 : : desc <<= GEN8_CTX_ADDRESSING_MODE_SHIFT;
470 : :
471 : : desc |= GEN8_CTX_VALID | GEN8_CTX_PRIVILEGE;
472 : : if (IS_GEN(engine->i915, 8))
473 : : desc |= GEN8_CTX_L3LLC_COHERENT;
474 : :
475 : : desc |= i915_ggtt_offset(ce->state); /* bits 12-31 */
476 : : /*
477 : : * The following 32bits are copied into the OA reports (dword 2).
478 : : * Consider updating oa_get_render_ctx_id in i915_perf.c when changing
479 : : * anything below.
480 : : */
481 : : if (INTEL_GEN(engine->i915) >= 11) {
482 : : desc |= (u64)engine->instance << GEN11_ENGINE_INSTANCE_SHIFT;
483 : : /* bits 48-53 */
484 : :
485 : : desc |= (u64)engine->class << GEN11_ENGINE_CLASS_SHIFT;
486 : : /* bits 61-63 */
487 : : }
488 : :
489 : : return desc;
490 : : }
491 : :
492 : 0 : static inline unsigned int dword_in_page(void *addr)
493 : : {
494 : 0 : return offset_in_page(addr) / sizeof(u32);
495 : : }
496 : :
497 : 0 : static void set_offsets(u32 *regs,
498 : : const u8 *data,
499 : : const struct intel_engine_cs *engine,
500 : : bool clear)
501 : : #define NOP(x) (BIT(7) | (x))
502 : : #define LRI(count, flags) ((flags) << 6 | (count) | BUILD_BUG_ON_ZERO(count >= BIT(6)))
503 : : #define POSTED BIT(0)
504 : : #define REG(x) (((x) >> 2) | BUILD_BUG_ON_ZERO(x >= 0x200))
505 : : #define REG16(x) \
506 : : (((x) >> 9) | BIT(7) | BUILD_BUG_ON_ZERO(x >= 0x10000)), \
507 : : (((x) >> 2) & 0x7f)
508 : : #define END(x) 0, (x)
509 : : {
510 : 0 : const u32 base = engine->mmio_base;
511 : :
512 [ # # ]: 0 : while (*data) {
513 : 0 : u8 count, flags;
514 : :
515 [ # # ]: 0 : if (*data & BIT(7)) { /* skip */
516 : 0 : count = *data++ & ~BIT(7);
517 [ # # ]: 0 : if (clear)
518 : 0 : memset32(regs, MI_NOOP, count);
519 : 0 : regs += count;
520 : 0 : continue;
521 : : }
522 : :
523 : 0 : count = *data & 0x3f;
524 : 0 : flags = *data >> 6;
525 : 0 : data++;
526 : :
527 : 0 : *regs = MI_LOAD_REGISTER_IMM(count);
528 [ # # ]: 0 : if (flags & POSTED)
529 : 0 : *regs |= MI_LRI_FORCE_POSTED;
530 [ # # ]: 0 : if (INTEL_GEN(engine->i915) >= 11)
531 : 0 : *regs |= MI_LRI_CS_MMIO;
532 : 0 : regs++;
533 : :
534 : 0 : GEM_BUG_ON(!count);
535 : 0 : do {
536 : 0 : u32 offset = 0;
537 : 0 : u8 v;
538 : :
539 : 0 : do {
540 : 0 : v = *data++;
541 : 0 : offset <<= 7;
542 : 0 : offset |= v & ~BIT(7);
543 [ # # ]: 0 : } while (v & BIT(7));
544 : :
545 : 0 : regs[0] = base + (offset << 2);
546 [ # # ]: 0 : if (clear)
547 : 0 : regs[1] = 0;
548 : 0 : regs += 2;
549 [ # # ]: 0 : } while (--count);
550 : : }
551 : :
552 [ # # ]: 0 : if (clear) {
553 : 0 : u8 count = *++data;
554 : :
555 : : /* Clear past the tail for HW access */
556 : 0 : GEM_BUG_ON(dword_in_page(regs) > count);
557 : 0 : memset32(regs, MI_NOOP, count - dword_in_page(regs));
558 : :
559 : : /* Close the batch; used mainly by live_lrc_layout() */
560 : 0 : *regs = MI_BATCH_BUFFER_END;
561 [ # # ]: 0 : if (INTEL_GEN(engine->i915) >= 10)
562 : 0 : *regs |= BIT(0);
563 : : }
564 : 0 : }
565 : :
566 : : static const u8 gen8_xcs_offsets[] = {
567 : : NOP(1),
568 : : LRI(11, 0),
569 : : REG16(0x244),
570 : : REG(0x034),
571 : : REG(0x030),
572 : : REG(0x038),
573 : : REG(0x03c),
574 : : REG(0x168),
575 : : REG(0x140),
576 : : REG(0x110),
577 : : REG(0x11c),
578 : : REG(0x114),
579 : : REG(0x118),
580 : :
581 : : NOP(9),
582 : : LRI(9, 0),
583 : : REG16(0x3a8),
584 : : REG16(0x28c),
585 : : REG16(0x288),
586 : : REG16(0x284),
587 : : REG16(0x280),
588 : : REG16(0x27c),
589 : : REG16(0x278),
590 : : REG16(0x274),
591 : : REG16(0x270),
592 : :
593 : : NOP(13),
594 : : LRI(2, 0),
595 : : REG16(0x200),
596 : : REG(0x028),
597 : :
598 : : END(80)
599 : : };
600 : :
601 : : static const u8 gen9_xcs_offsets[] = {
602 : : NOP(1),
603 : : LRI(14, POSTED),
604 : : REG16(0x244),
605 : : REG(0x034),
606 : : REG(0x030),
607 : : REG(0x038),
608 : : REG(0x03c),
609 : : REG(0x168),
610 : : REG(0x140),
611 : : REG(0x110),
612 : : REG(0x11c),
613 : : REG(0x114),
614 : : REG(0x118),
615 : : REG(0x1c0),
616 : : REG(0x1c4),
617 : : REG(0x1c8),
618 : :
619 : : NOP(3),
620 : : LRI(9, POSTED),
621 : : REG16(0x3a8),
622 : : REG16(0x28c),
623 : : REG16(0x288),
624 : : REG16(0x284),
625 : : REG16(0x280),
626 : : REG16(0x27c),
627 : : REG16(0x278),
628 : : REG16(0x274),
629 : : REG16(0x270),
630 : :
631 : : NOP(13),
632 : : LRI(1, POSTED),
633 : : REG16(0x200),
634 : :
635 : : NOP(13),
636 : : LRI(44, POSTED),
637 : : REG(0x028),
638 : : REG(0x09c),
639 : : REG(0x0c0),
640 : : REG(0x178),
641 : : REG(0x17c),
642 : : REG16(0x358),
643 : : REG(0x170),
644 : : REG(0x150),
645 : : REG(0x154),
646 : : REG(0x158),
647 : : REG16(0x41c),
648 : : REG16(0x600),
649 : : REG16(0x604),
650 : : REG16(0x608),
651 : : REG16(0x60c),
652 : : REG16(0x610),
653 : : REG16(0x614),
654 : : REG16(0x618),
655 : : REG16(0x61c),
656 : : REG16(0x620),
657 : : REG16(0x624),
658 : : REG16(0x628),
659 : : REG16(0x62c),
660 : : REG16(0x630),
661 : : REG16(0x634),
662 : : REG16(0x638),
663 : : REG16(0x63c),
664 : : REG16(0x640),
665 : : REG16(0x644),
666 : : REG16(0x648),
667 : : REG16(0x64c),
668 : : REG16(0x650),
669 : : REG16(0x654),
670 : : REG16(0x658),
671 : : REG16(0x65c),
672 : : REG16(0x660),
673 : : REG16(0x664),
674 : : REG16(0x668),
675 : : REG16(0x66c),
676 : : REG16(0x670),
677 : : REG16(0x674),
678 : : REG16(0x678),
679 : : REG16(0x67c),
680 : : REG(0x068),
681 : :
682 : : END(176)
683 : : };
684 : :
685 : : static const u8 gen12_xcs_offsets[] = {
686 : : NOP(1),
687 : : LRI(13, POSTED),
688 : : REG16(0x244),
689 : : REG(0x034),
690 : : REG(0x030),
691 : : REG(0x038),
692 : : REG(0x03c),
693 : : REG(0x168),
694 : : REG(0x140),
695 : : REG(0x110),
696 : : REG(0x1c0),
697 : : REG(0x1c4),
698 : : REG(0x1c8),
699 : : REG(0x180),
700 : : REG16(0x2b4),
701 : :
702 : : NOP(5),
703 : : LRI(9, POSTED),
704 : : REG16(0x3a8),
705 : : REG16(0x28c),
706 : : REG16(0x288),
707 : : REG16(0x284),
708 : : REG16(0x280),
709 : : REG16(0x27c),
710 : : REG16(0x278),
711 : : REG16(0x274),
712 : : REG16(0x270),
713 : :
714 : : END(80)
715 : : };
716 : :
717 : : static const u8 gen8_rcs_offsets[] = {
718 : : NOP(1),
719 : : LRI(14, POSTED),
720 : : REG16(0x244),
721 : : REG(0x034),
722 : : REG(0x030),
723 : : REG(0x038),
724 : : REG(0x03c),
725 : : REG(0x168),
726 : : REG(0x140),
727 : : REG(0x110),
728 : : REG(0x11c),
729 : : REG(0x114),
730 : : REG(0x118),
731 : : REG(0x1c0),
732 : : REG(0x1c4),
733 : : REG(0x1c8),
734 : :
735 : : NOP(3),
736 : : LRI(9, POSTED),
737 : : REG16(0x3a8),
738 : : REG16(0x28c),
739 : : REG16(0x288),
740 : : REG16(0x284),
741 : : REG16(0x280),
742 : : REG16(0x27c),
743 : : REG16(0x278),
744 : : REG16(0x274),
745 : : REG16(0x270),
746 : :
747 : : NOP(13),
748 : : LRI(1, 0),
749 : : REG(0x0c8),
750 : :
751 : : END(80)
752 : : };
753 : :
754 : : static const u8 gen9_rcs_offsets[] = {
755 : : NOP(1),
756 : : LRI(14, POSTED),
757 : : REG16(0x244),
758 : : REG(0x34),
759 : : REG(0x30),
760 : : REG(0x38),
761 : : REG(0x3c),
762 : : REG(0x168),
763 : : REG(0x140),
764 : : REG(0x110),
765 : : REG(0x11c),
766 : : REG(0x114),
767 : : REG(0x118),
768 : : REG(0x1c0),
769 : : REG(0x1c4),
770 : : REG(0x1c8),
771 : :
772 : : NOP(3),
773 : : LRI(9, POSTED),
774 : : REG16(0x3a8),
775 : : REG16(0x28c),
776 : : REG16(0x288),
777 : : REG16(0x284),
778 : : REG16(0x280),
779 : : REG16(0x27c),
780 : : REG16(0x278),
781 : : REG16(0x274),
782 : : REG16(0x270),
783 : :
784 : : NOP(13),
785 : : LRI(1, 0),
786 : : REG(0xc8),
787 : :
788 : : NOP(13),
789 : : LRI(44, POSTED),
790 : : REG(0x28),
791 : : REG(0x9c),
792 : : REG(0xc0),
793 : : REG(0x178),
794 : : REG(0x17c),
795 : : REG16(0x358),
796 : : REG(0x170),
797 : : REG(0x150),
798 : : REG(0x154),
799 : : REG(0x158),
800 : : REG16(0x41c),
801 : : REG16(0x600),
802 : : REG16(0x604),
803 : : REG16(0x608),
804 : : REG16(0x60c),
805 : : REG16(0x610),
806 : : REG16(0x614),
807 : : REG16(0x618),
808 : : REG16(0x61c),
809 : : REG16(0x620),
810 : : REG16(0x624),
811 : : REG16(0x628),
812 : : REG16(0x62c),
813 : : REG16(0x630),
814 : : REG16(0x634),
815 : : REG16(0x638),
816 : : REG16(0x63c),
817 : : REG16(0x640),
818 : : REG16(0x644),
819 : : REG16(0x648),
820 : : REG16(0x64c),
821 : : REG16(0x650),
822 : : REG16(0x654),
823 : : REG16(0x658),
824 : : REG16(0x65c),
825 : : REG16(0x660),
826 : : REG16(0x664),
827 : : REG16(0x668),
828 : : REG16(0x66c),
829 : : REG16(0x670),
830 : : REG16(0x674),
831 : : REG16(0x678),
832 : : REG16(0x67c),
833 : : REG(0x68),
834 : :
835 : : END(176)
836 : : };
837 : :
838 : : static const u8 gen11_rcs_offsets[] = {
839 : : NOP(1),
840 : : LRI(15, POSTED),
841 : : REG16(0x244),
842 : : REG(0x034),
843 : : REG(0x030),
844 : : REG(0x038),
845 : : REG(0x03c),
846 : : REG(0x168),
847 : : REG(0x140),
848 : : REG(0x110),
849 : : REG(0x11c),
850 : : REG(0x114),
851 : : REG(0x118),
852 : : REG(0x1c0),
853 : : REG(0x1c4),
854 : : REG(0x1c8),
855 : : REG(0x180),
856 : :
857 : : NOP(1),
858 : : LRI(9, POSTED),
859 : : REG16(0x3a8),
860 : : REG16(0x28c),
861 : : REG16(0x288),
862 : : REG16(0x284),
863 : : REG16(0x280),
864 : : REG16(0x27c),
865 : : REG16(0x278),
866 : : REG16(0x274),
867 : : REG16(0x270),
868 : :
869 : : LRI(1, POSTED),
870 : : REG(0x1b0),
871 : :
872 : : NOP(10),
873 : : LRI(1, 0),
874 : : REG(0x0c8),
875 : :
876 : : END(80)
877 : : };
878 : :
879 : : static const u8 gen12_rcs_offsets[] = {
880 : : NOP(1),
881 : : LRI(13, POSTED),
882 : : REG16(0x244),
883 : : REG(0x034),
884 : : REG(0x030),
885 : : REG(0x038),
886 : : REG(0x03c),
887 : : REG(0x168),
888 : : REG(0x140),
889 : : REG(0x110),
890 : : REG(0x1c0),
891 : : REG(0x1c4),
892 : : REG(0x1c8),
893 : : REG(0x180),
894 : : REG16(0x2b4),
895 : :
896 : : NOP(5),
897 : : LRI(9, POSTED),
898 : : REG16(0x3a8),
899 : : REG16(0x28c),
900 : : REG16(0x288),
901 : : REG16(0x284),
902 : : REG16(0x280),
903 : : REG16(0x27c),
904 : : REG16(0x278),
905 : : REG16(0x274),
906 : : REG16(0x270),
907 : :
908 : : LRI(3, POSTED),
909 : : REG(0x1b0),
910 : : REG16(0x5a8),
911 : : REG16(0x5ac),
912 : :
913 : : NOP(6),
914 : : LRI(1, 0),
915 : : REG(0x0c8),
916 : :
917 : : END(80)
918 : : };
919 : :
920 : : #undef END
921 : : #undef REG16
922 : : #undef REG
923 : : #undef LRI
924 : : #undef NOP
925 : :
926 : 0 : static const u8 *reg_offsets(const struct intel_engine_cs *engine)
927 : : {
928 : : /*
929 : : * The gen12+ lists only have the registers we program in the basic
930 : : * default state. We rely on the context image using relative
931 : : * addressing to automatic fixup the register state between the
932 : : * physical engines for virtual engine.
933 : : */
934 : 0 : GEM_BUG_ON(INTEL_GEN(engine->i915) >= 12 &&
935 : : !intel_engine_has_relative_mmio(engine));
936 : :
937 : 0 : if (engine->class == RENDER_CLASS) {
938 [ # # # # ]: 0 : if (INTEL_GEN(engine->i915) >= 12)
939 : : return gen12_rcs_offsets;
940 [ # # # # ]: 0 : else if (INTEL_GEN(engine->i915) >= 11)
941 : : return gen11_rcs_offsets;
942 [ # # # # ]: 0 : else if (INTEL_GEN(engine->i915) >= 9)
943 : : return gen9_rcs_offsets;
944 : : else
945 : 0 : return gen8_rcs_offsets;
946 : : } else {
947 [ # # # # ]: 0 : if (INTEL_GEN(engine->i915) >= 12)
948 : : return gen12_xcs_offsets;
949 [ # # # # ]: 0 : else if (INTEL_GEN(engine->i915) >= 9)
950 : : return gen9_xcs_offsets;
951 : : else
952 : 0 : return gen8_xcs_offsets;
953 : : }
954 : : }
955 : :
956 : : static struct i915_request *
957 : 0 : __unwind_incomplete_requests(struct intel_engine_cs *engine)
958 : : {
959 : 0 : struct i915_request *rq, *rn, *active = NULL;
960 : 0 : struct list_head *uninitialized_var(pl);
961 : 0 : int prio = I915_PRIORITY_INVALID;
962 : :
963 : 0 : lockdep_assert_held(&engine->active.lock);
964 : :
965 [ # # ]: 0 : list_for_each_entry_safe_reverse(rq, rn,
966 : : &engine->active.requests,
967 : : sched.link) {
968 [ # # ]: 0 : if (i915_request_completed(rq))
969 : 0 : continue; /* XXX */
970 : :
971 : 0 : __i915_request_unsubmit(rq);
972 : :
973 : : /*
974 : : * Push the request back into the queue for later resubmission.
975 : : * If this request is not native to this physical engine (i.e.
976 : : * it came from a virtual source), push it back onto the virtual
977 : : * engine so that it can be moved across onto another physical
978 : : * engine as load dictates.
979 : : */
980 [ # # ]: 0 : if (likely(rq->execution_mask == engine->mask)) {
981 : 0 : GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
982 [ # # ]: 0 : if (rq_prio(rq) != prio) {
983 : 0 : prio = rq_prio(rq);
984 : 0 : pl = i915_sched_lookup_priolist(engine, prio);
985 : : }
986 : 0 : GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
987 : :
988 : 0 : list_move(&rq->sched.link, pl);
989 : 0 : set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
990 : :
991 : 0 : active = rq;
992 : : } else {
993 : 0 : struct intel_engine_cs *owner = rq->context->engine;
994 : :
995 : : /*
996 : : * Decouple the virtual breadcrumb before moving it
997 : : * back to the virtual engine -- we don't want the
998 : : * request to complete in the background and try
999 : : * and cancel the breadcrumb on the virtual engine
1000 : : * (instead of the old engine where it is linked)!
1001 : : */
1002 [ # # ]: 0 : if (test_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT,
1003 : 0 : &rq->fence.flags)) {
1004 : 0 : spin_lock_nested(&rq->lock,
1005 : : SINGLE_DEPTH_NESTING);
1006 : 0 : i915_request_cancel_breadcrumb(rq);
1007 : 0 : spin_unlock(&rq->lock);
1008 : : }
1009 : 0 : rq->engine = owner;
1010 : 0 : owner->submit_request(rq);
1011 : 0 : active = NULL;
1012 : : }
1013 : : }
1014 : :
1015 : 0 : return active;
1016 : : }
1017 : :
1018 : : struct i915_request *
1019 : 0 : execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
1020 : : {
1021 : 0 : struct intel_engine_cs *engine =
1022 : 0 : container_of(execlists, typeof(*engine), execlists);
1023 : :
1024 : 0 : return __unwind_incomplete_requests(engine);
1025 : : }
1026 : :
1027 : : static inline void
1028 : 0 : execlists_context_status_change(struct i915_request *rq, unsigned long status)
1029 : : {
1030 : : /*
1031 : : * Only used when GVT-g is enabled now. When GVT-g is disabled,
1032 : : * The compiler should eliminate this function as dead-code.
1033 : : */
1034 : 0 : if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
1035 : 0 : return;
1036 : :
1037 : : atomic_notifier_call_chain(&rq->engine->context_status_notifier,
1038 : : status, rq);
1039 : : }
1040 : :
1041 : 0 : static void intel_engine_context_in(struct intel_engine_cs *engine)
1042 : : {
1043 : 0 : unsigned long flags;
1044 : :
1045 [ # # ]: 0 : if (READ_ONCE(engine->stats.enabled) == 0)
1046 : : return;
1047 : :
1048 : 0 : write_seqlock_irqsave(&engine->stats.lock, flags);
1049 : :
1050 [ # # ]: 0 : if (engine->stats.enabled > 0) {
1051 [ # # ]: 0 : if (engine->stats.active++ == 0)
1052 : 0 : engine->stats.start = ktime_get();
1053 : 0 : GEM_BUG_ON(engine->stats.active == 0);
1054 : : }
1055 : :
1056 : 0 : write_sequnlock_irqrestore(&engine->stats.lock, flags);
1057 : : }
1058 : :
1059 : 0 : static void intel_engine_context_out(struct intel_engine_cs *engine)
1060 : : {
1061 : 0 : unsigned long flags;
1062 : :
1063 [ # # ]: 0 : if (READ_ONCE(engine->stats.enabled) == 0)
1064 : : return;
1065 : :
1066 : 0 : write_seqlock_irqsave(&engine->stats.lock, flags);
1067 : :
1068 [ # # ]: 0 : if (engine->stats.enabled > 0) {
1069 : 0 : ktime_t last;
1070 : :
1071 [ # # # # ]: 0 : if (engine->stats.active && --engine->stats.active == 0) {
1072 : : /*
1073 : : * Decrement the active context count and in case GPU
1074 : : * is now idle add up to the running total.
1075 : : */
1076 : 0 : last = ktime_sub(ktime_get(), engine->stats.start);
1077 : :
1078 : 0 : engine->stats.total = ktime_add(engine->stats.total,
1079 : : last);
1080 [ # # ]: 0 : } else if (engine->stats.active == 0) {
1081 : : /*
1082 : : * After turning on engine stats, context out might be
1083 : : * the first event in which case we account from the
1084 : : * time stats gathering was turned on.
1085 : : */
1086 : 0 : last = ktime_sub(ktime_get(), engine->stats.enabled_at);
1087 : :
1088 : 0 : engine->stats.total = ktime_add(engine->stats.total,
1089 : : last);
1090 : : }
1091 : : }
1092 : :
1093 : 0 : write_sequnlock_irqrestore(&engine->stats.lock, flags);
1094 : : }
1095 : :
1096 : 0 : static int lrc_ring_mi_mode(const struct intel_engine_cs *engine)
1097 : : {
1098 : 0 : if (INTEL_GEN(engine->i915) >= 12)
1099 : : return 0x60;
1100 [ # # # # ]: 0 : else if (INTEL_GEN(engine->i915) >= 9)
1101 : : return 0x54;
1102 [ # # # # ]: 0 : else if (engine->class == RENDER_CLASS)
1103 : : return 0x58;
1104 : : else
1105 : : return -1;
1106 : : }
1107 : :
1108 : : static void
1109 : : execlists_check_context(const struct intel_context *ce,
1110 : : const struct intel_engine_cs *engine)
1111 : : {
1112 : : const struct intel_ring *ring = ce->ring;
1113 : : u32 *regs = ce->lrc_reg_state;
1114 : : bool valid = true;
1115 : : int x;
1116 : :
1117 : : if (regs[CTX_RING_START] != i915_ggtt_offset(ring->vma)) {
1118 : : pr_err("%s: context submitted with incorrect RING_START [%08x], expected %08x\n",
1119 : : engine->name,
1120 : : regs[CTX_RING_START],
1121 : : i915_ggtt_offset(ring->vma));
1122 : : regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
1123 : : valid = false;
1124 : : }
1125 : :
1126 : : if ((regs[CTX_RING_CTL] & ~(RING_WAIT | RING_WAIT_SEMAPHORE)) !=
1127 : : (RING_CTL_SIZE(ring->size) | RING_VALID)) {
1128 : : pr_err("%s: context submitted with incorrect RING_CTL [%08x], expected %08x\n",
1129 : : engine->name,
1130 : : regs[CTX_RING_CTL],
1131 : : (u32)(RING_CTL_SIZE(ring->size) | RING_VALID));
1132 : : regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
1133 : : valid = false;
1134 : : }
1135 : :
1136 : : x = lrc_ring_mi_mode(engine);
1137 : : if (x != -1 && regs[x + 1] & (regs[x + 1] >> 16) & STOP_RING) {
1138 : : pr_err("%s: context submitted with STOP_RING [%08x] in RING_MI_MODE\n",
1139 : : engine->name, regs[x + 1]);
1140 : : regs[x + 1] &= ~STOP_RING;
1141 : : regs[x + 1] |= STOP_RING << 16;
1142 : : valid = false;
1143 : : }
1144 : :
1145 : : WARN_ONCE(!valid, "Invalid lrc state found before submission\n");
1146 : : }
1147 : :
1148 : 0 : static void restore_default_state(struct intel_context *ce,
1149 : : struct intel_engine_cs *engine)
1150 : : {
1151 : 0 : u32 *regs = ce->lrc_reg_state;
1152 : :
1153 [ # # ]: 0 : if (engine->pinned_default_state)
1154 : 0 : memcpy(regs, /* skip restoring the vanilla PPHWSP */
1155 : 0 : engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
1156 : 0 : engine->context_size - PAGE_SIZE);
1157 : :
1158 : 0 : execlists_init_reg_state(regs, ce, engine, ce->ring, false);
1159 : 0 : }
1160 : :
1161 : 0 : static void reset_active(struct i915_request *rq,
1162 : : struct intel_engine_cs *engine)
1163 : : {
1164 : 0 : struct intel_context * const ce = rq->context;
1165 : 0 : u32 head;
1166 : :
1167 : : /*
1168 : : * The executing context has been cancelled. We want to prevent
1169 : : * further execution along this context and propagate the error on
1170 : : * to anything depending on its results.
1171 : : *
1172 : : * In __i915_request_submit(), we apply the -EIO and remove the
1173 : : * requests' payloads for any banned requests. But first, we must
1174 : : * rewind the context back to the start of the incomplete request so
1175 : : * that we do not jump back into the middle of the batch.
1176 : : *
1177 : : * We preserve the breadcrumbs and semaphores of the incomplete
1178 : : * requests so that inter-timeline dependencies (i.e other timelines)
1179 : : * remain correctly ordered. And we defer to __i915_request_submit()
1180 : : * so that all asynchronous waits are correctly handled.
1181 : : */
1182 : 0 : ENGINE_TRACE(engine, "{ rq=%llx:%lld }\n",
1183 : : rq->fence.context, rq->fence.seqno);
1184 : :
1185 : : /* On resubmission of the active request, payload will be scrubbed */
1186 [ # # ]: 0 : if (i915_request_completed(rq))
1187 : 0 : head = rq->tail;
1188 : : else
1189 : 0 : head = active_request(ce->timeline, rq)->head;
1190 : 0 : head = intel_ring_wrap(ce->ring, head);
1191 : :
1192 : : /* Scrub the context image to prevent replaying the previous batch */
1193 : 0 : restore_default_state(ce, engine);
1194 : 0 : __execlists_update_reg_state(ce, engine, head);
1195 : :
1196 : : /* We've switched away, so this should be a no-op, but intent matters */
1197 : 0 : ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
1198 : 0 : }
1199 : :
1200 : : static inline struct intel_engine_cs *
1201 : 0 : __execlists_schedule_in(struct i915_request *rq)
1202 : : {
1203 : 0 : struct intel_engine_cs * const engine = rq->engine;
1204 : 0 : struct intel_context * const ce = rq->context;
1205 : :
1206 : 0 : intel_context_get(ce);
1207 : :
1208 [ # # ]: 0 : if (unlikely(intel_context_is_banned(ce)))
1209 : 0 : reset_active(rq, engine);
1210 : :
1211 : 0 : if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
1212 : : execlists_check_context(ce, engine);
1213 : :
1214 [ # # ]: 0 : if (ce->tag) {
1215 : : /* Use a fixed tag for OA and friends */
1216 : 0 : ce->lrc_desc |= (u64)ce->tag << 32;
1217 : : } else {
1218 : : /* We don't need a strict matching tag, just different values */
1219 : 0 : ce->lrc_desc &= ~GENMASK_ULL(47, 37);
1220 : 0 : ce->lrc_desc |=
1221 : 0 : (u64)(++engine->context_tag % NUM_CONTEXT_TAG) <<
1222 : : GEN11_SW_CTX_ID_SHIFT;
1223 : 0 : BUILD_BUG_ON(NUM_CONTEXT_TAG > GEN12_MAX_CONTEXT_HW_ID);
1224 : : }
1225 : :
1226 : 0 : __intel_gt_pm_get(engine->gt);
1227 : 0 : execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
1228 : 0 : intel_engine_context_in(engine);
1229 : :
1230 : 0 : return engine;
1231 : : }
1232 : :
1233 : : static inline struct i915_request *
1234 : : execlists_schedule_in(struct i915_request *rq, int idx)
1235 : : {
1236 : : struct intel_context * const ce = rq->context;
1237 : : struct intel_engine_cs *old;
1238 : :
1239 : : GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
1240 : : trace_i915_request_in(rq, idx);
1241 : :
1242 : : old = READ_ONCE(ce->inflight);
1243 : : do {
1244 : : if (!old) {
1245 : : WRITE_ONCE(ce->inflight, __execlists_schedule_in(rq));
1246 : : break;
1247 : : }
1248 : : } while (!try_cmpxchg(&ce->inflight, &old, ptr_inc(old)));
1249 : :
1250 : : GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
1251 : : return i915_request_get(rq);
1252 : : }
1253 : :
1254 : 0 : static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
1255 : : {
1256 : 0 : struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
1257 : 0 : struct i915_request *next = READ_ONCE(ve->request);
1258 : :
1259 [ # # # # ]: 0 : if (next && next->execution_mask & ~rq->execution_mask)
1260 : 0 : tasklet_schedule(&ve->base.execlists.tasklet);
1261 : : }
1262 : :
1263 : : static inline void
1264 : 0 : __execlists_schedule_out(struct i915_request *rq,
1265 : : struct intel_engine_cs * const engine)
1266 : : {
1267 : 0 : struct intel_context * const ce = rq->context;
1268 : :
1269 : : /*
1270 : : * NB process_csb() is not under the engine->active.lock and hence
1271 : : * schedule_out can race with schedule_in meaning that we should
1272 : : * refrain from doing non-trivial work here.
1273 : : */
1274 : :
1275 : : /*
1276 : : * If we have just completed this context, the engine may now be
1277 : : * idle and we want to re-enter powersaving.
1278 : : */
1279 [ # # # # ]: 0 : if (list_is_last(&rq->link, &ce->timeline->requests) &&
1280 : 0 : i915_request_completed(rq))
1281 : 0 : intel_engine_add_retire(engine, ce->timeline);
1282 : :
1283 : 0 : intel_engine_context_out(engine);
1284 : 0 : execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
1285 : 0 : intel_gt_pm_put_async(engine->gt);
1286 : :
1287 : : /*
1288 : : * If this is part of a virtual engine, its next request may
1289 : : * have been blocked waiting for access to the active context.
1290 : : * We have to kick all the siblings again in case we need to
1291 : : * switch (e.g. the next request is not runnable on this
1292 : : * engine). Hopefully, we will already have submitted the next
1293 : : * request before the tasklet runs and do not need to rebuild
1294 : : * each virtual tree and kick everyone again.
1295 : : */
1296 [ # # ]: 0 : if (ce->engine != engine)
1297 [ # # ]: 0 : kick_siblings(rq, ce);
1298 : :
1299 : 0 : intel_context_put(ce);
1300 : 0 : }
1301 : :
1302 : : static inline void
1303 : 0 : execlists_schedule_out(struct i915_request *rq)
1304 : : {
1305 : 0 : struct intel_context * const ce = rq->context;
1306 : 0 : struct intel_engine_cs *cur, *old;
1307 : :
1308 : 0 : trace_i915_request_out(rq);
1309 : :
1310 : 0 : old = READ_ONCE(ce->inflight);
1311 : 0 : do
1312 [ # # ]: 0 : cur = ptr_unmask_bits(old, 2) ? ptr_dec(old) : NULL;
1313 [ # # # # ]: 0 : while (!try_cmpxchg(&ce->inflight, &old, cur));
1314 [ # # ]: 0 : if (!cur)
1315 : 0 : __execlists_schedule_out(rq, old);
1316 : :
1317 : 0 : i915_request_put(rq);
1318 : 0 : }
1319 : :
1320 : 0 : static u64 execlists_update_context(struct i915_request *rq)
1321 : : {
1322 : 0 : struct intel_context *ce = rq->context;
1323 : 0 : u64 desc = ce->lrc_desc;
1324 : 0 : u32 tail, prev;
1325 : :
1326 : : /*
1327 : : * WaIdleLiteRestore:bdw,skl
1328 : : *
1329 : : * We should never submit the context with the same RING_TAIL twice
1330 : : * just in case we submit an empty ring, which confuses the HW.
1331 : : *
1332 : : * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
1333 : : * the normal request to be able to always advance the RING_TAIL on
1334 : : * subsequent resubmissions (for lite restore). Should that fail us,
1335 : : * and we try and submit the same tail again, force the context
1336 : : * reload.
1337 : : *
1338 : : * If we need to return to a preempted context, we need to skip the
1339 : : * lite-restore and force it to reload the RING_TAIL. Otherwise, the
1340 : : * HW has a tendency to ignore us rewinding the TAIL to the end of
1341 : : * an earlier request.
1342 : : */
1343 [ # # ]: 0 : tail = intel_ring_set_tail(rq->ring, rq->tail);
1344 : 0 : prev = ce->lrc_reg_state[CTX_RING_TAIL];
1345 [ # # ]: 0 : if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
1346 : 0 : desc |= CTX_DESC_FORCE_RESTORE;
1347 : 0 : ce->lrc_reg_state[CTX_RING_TAIL] = tail;
1348 : 0 : rq->tail = rq->wa_tail;
1349 : :
1350 : : /*
1351 : : * Make sure the context image is complete before we submit it to HW.
1352 : : *
1353 : : * Ostensibly, writes (including the WCB) should be flushed prior to
1354 : : * an uncached write such as our mmio register access, the empirical
1355 : : * evidence (esp. on Braswell) suggests that the WC write into memory
1356 : : * may not be visible to the HW prior to the completion of the UC
1357 : : * register write and that we may begin execution from the context
1358 : : * before its image is complete leading to invalid PD chasing.
1359 : : */
1360 : 0 : wmb();
1361 : :
1362 : 0 : ce->lrc_desc &= ~CTX_DESC_FORCE_RESTORE;
1363 : 0 : return desc;
1364 : : }
1365 : :
1366 : 0 : static inline void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
1367 : : {
1368 : 0 : if (execlists->ctrl_reg) {
1369 : 0 : writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
1370 : 0 : writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
1371 : : } else {
1372 : 0 : writel(upper_32_bits(desc), execlists->submit_reg);
1373 : 0 : writel(lower_32_bits(desc), execlists->submit_reg);
1374 : : }
1375 : : }
1376 : :
1377 : : static __maybe_unused void
1378 : 0 : trace_ports(const struct intel_engine_execlists *execlists,
1379 : : const char *msg,
1380 : : struct i915_request * const *ports)
1381 : : {
1382 : 0 : const struct intel_engine_cs *engine =
1383 : 0 : container_of(execlists, typeof(*engine), execlists);
1384 : :
1385 : 0 : if (!ports[0])
1386 : : return;
1387 : :
1388 : 0 : ENGINE_TRACE(engine, "%s { %llx:%lld%s, %llx:%lld }\n", msg,
1389 : : ports[0]->fence.context,
1390 : : ports[0]->fence.seqno,
1391 : : i915_request_completed(ports[0]) ? "!" :
1392 : : i915_request_started(ports[0]) ? "*" :
1393 : : "",
1394 : : ports[1] ? ports[1]->fence.context : 0,
1395 : : ports[1] ? ports[1]->fence.seqno : 0);
1396 : : }
1397 : :
1398 : : static __maybe_unused bool
1399 : : assert_pending_valid(const struct intel_engine_execlists *execlists,
1400 : : const char *msg)
1401 : : {
1402 : : struct i915_request * const *port, *rq;
1403 : : struct intel_context *ce = NULL;
1404 : :
1405 : : trace_ports(execlists, msg, execlists->pending);
1406 : :
1407 : : if (!execlists->pending[0]) {
1408 : : GEM_TRACE_ERR("Nothing pending for promotion!\n");
1409 : : return false;
1410 : : }
1411 : :
1412 : : if (execlists->pending[execlists_num_ports(execlists)]) {
1413 : : GEM_TRACE_ERR("Excess pending[%d] for promotion!\n",
1414 : : execlists_num_ports(execlists));
1415 : : return false;
1416 : : }
1417 : :
1418 : : for (port = execlists->pending; (rq = *port); port++) {
1419 : : unsigned long flags;
1420 : : bool ok = true;
1421 : :
1422 : : GEM_BUG_ON(!kref_read(&rq->fence.refcount));
1423 : : GEM_BUG_ON(!i915_request_is_active(rq));
1424 : :
1425 : : if (ce == rq->context) {
1426 : : GEM_TRACE_ERR("Dup context:%llx in pending[%zd]\n",
1427 : : ce->timeline->fence_context,
1428 : : port - execlists->pending);
1429 : : return false;
1430 : : }
1431 : : ce = rq->context;
1432 : :
1433 : : /* Hold tightly onto the lock to prevent concurrent retires! */
1434 : : if (!spin_trylock_irqsave(&rq->lock, flags))
1435 : : continue;
1436 : :
1437 : : if (i915_request_completed(rq))
1438 : : goto unlock;
1439 : :
1440 : : if (i915_active_is_idle(&ce->active) &&
1441 : : !intel_context_is_barrier(ce)) {
1442 : : GEM_TRACE_ERR("Inactive context:%llx in pending[%zd]\n",
1443 : : ce->timeline->fence_context,
1444 : : port - execlists->pending);
1445 : : ok = false;
1446 : : goto unlock;
1447 : : }
1448 : :
1449 : : if (!i915_vma_is_pinned(ce->state)) {
1450 : : GEM_TRACE_ERR("Unpinned context:%llx in pending[%zd]\n",
1451 : : ce->timeline->fence_context,
1452 : : port - execlists->pending);
1453 : : ok = false;
1454 : : goto unlock;
1455 : : }
1456 : :
1457 : : if (!i915_vma_is_pinned(ce->ring->vma)) {
1458 : : GEM_TRACE_ERR("Unpinned ring:%llx in pending[%zd]\n",
1459 : : ce->timeline->fence_context,
1460 : : port - execlists->pending);
1461 : : ok = false;
1462 : : goto unlock;
1463 : : }
1464 : :
1465 : : unlock:
1466 : : spin_unlock_irqrestore(&rq->lock, flags);
1467 : : if (!ok)
1468 : : return false;
1469 : : }
1470 : :
1471 : : return ce;
1472 : : }
1473 : :
1474 : 0 : static void execlists_submit_ports(struct intel_engine_cs *engine)
1475 : : {
1476 : 0 : struct intel_engine_execlists *execlists = &engine->execlists;
1477 : 0 : unsigned int n;
1478 : :
1479 : 0 : GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));
1480 : :
1481 : : /*
1482 : : * We can skip acquiring intel_runtime_pm_get() here as it was taken
1483 : : * on our behalf by the request (see i915_gem_mark_busy()) and it will
1484 : : * not be relinquished until the device is idle (see
1485 : : * i915_gem_idle_work_handler()). As a precaution, we make sure
1486 : : * that all ELSP are drained i.e. we have processed the CSB,
1487 : : * before allowing ourselves to idle and calling intel_runtime_pm_put().
1488 : : */
1489 : 0 : GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
1490 : :
1491 : : /*
1492 : : * ELSQ note: the submit queue is not cleared after being submitted
1493 : : * to the HW so we need to make sure we always clean it up. This is
1494 : : * currently ensured by the fact that we always write the same number
1495 : : * of elsq entries, keep this in mind before changing the loop below.
1496 : : */
1497 [ # # ]: 0 : for (n = execlists_num_ports(execlists); n--; ) {
1498 : 0 : struct i915_request *rq = execlists->pending[n];
1499 : :
1500 [ # # # # ]: 0 : write_desc(execlists,
1501 : 0 : rq ? execlists_update_context(rq) : 0,
1502 : : n);
1503 : : }
1504 : :
1505 : : /* we need to manually load the submit queue */
1506 [ # # ]: 0 : if (execlists->ctrl_reg)
1507 : 0 : writel(EL_CTRL_LOAD, execlists->ctrl_reg);
1508 : 0 : }
1509 : :
1510 : : static bool ctx_single_port_submission(const struct intel_context *ce)
1511 : : {
1512 : : return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
1513 : : intel_context_force_single_submission(ce));
1514 : : }
1515 : :
1516 : : static bool can_merge_ctx(const struct intel_context *prev,
1517 : : const struct intel_context *next)
1518 : : {
1519 : : if (prev != next)
1520 : : return false;
1521 : :
1522 : : if (ctx_single_port_submission(prev))
1523 : : return false;
1524 : :
1525 : : return true;
1526 : : }
1527 : :
1528 : : static bool can_merge_rq(const struct i915_request *prev,
1529 : : const struct i915_request *next)
1530 : : {
1531 : : GEM_BUG_ON(prev == next);
1532 : : GEM_BUG_ON(!assert_priority_queue(prev, next));
1533 : :
1534 : : /*
1535 : : * We do not submit known completed requests. Therefore if the next
1536 : : * request is already completed, we can pretend to merge it in
1537 : : * with the previous context (and we will skip updating the ELSP
1538 : : * and tracking). Thus hopefully keeping the ELSP full with active
1539 : : * contexts, despite the best efforts of preempt-to-busy to confuse
1540 : : * us.
1541 : : */
1542 : : if (i915_request_completed(next))
1543 : : return true;
1544 : :
1545 : : if (unlikely((prev->fence.flags ^ next->fence.flags) &
1546 : : (BIT(I915_FENCE_FLAG_NOPREEMPT) |
1547 : : BIT(I915_FENCE_FLAG_SENTINEL))))
1548 : : return false;
1549 : :
1550 : : if (!can_merge_ctx(prev->context, next->context))
1551 : : return false;
1552 : :
1553 : : return true;
1554 : : }
1555 : :
1556 : 0 : static void virtual_update_register_offsets(u32 *regs,
1557 : : struct intel_engine_cs *engine)
1558 : : {
1559 [ # # ]: 0 : set_offsets(regs, reg_offsets(engine), engine, false);
1560 : 0 : }
1561 : :
1562 : 0 : static bool virtual_matches(const struct virtual_engine *ve,
1563 : : const struct i915_request *rq,
1564 : : const struct intel_engine_cs *engine)
1565 : : {
1566 : 0 : const struct intel_engine_cs *inflight;
1567 : :
1568 : 0 : if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
1569 : : return false;
1570 : :
1571 : : /*
1572 : : * We track when the HW has completed saving the context image
1573 : : * (i.e. when we have seen the final CS event switching out of
1574 : : * the context) and must not overwrite the context image before
1575 : : * then. This restricts us to only using the active engine
1576 : : * while the previous virtualized request is inflight (so
1577 : : * we reuse the register offsets). This is a very small
1578 : : * hystersis on the greedy seelction algorithm.
1579 : : */
1580 : 0 : inflight = intel_context_inflight(&ve->context);
1581 [ # # # # ]: 0 : if (inflight && inflight != engine)
1582 : : return false;
1583 : :
1584 : : return true;
1585 : : }
1586 : :
1587 : 0 : static void virtual_xfer_breadcrumbs(struct virtual_engine *ve,
1588 : : struct intel_engine_cs *engine)
1589 : : {
1590 : 0 : struct intel_engine_cs *old = ve->siblings[0];
1591 : :
1592 : : /* All unattached (rq->engine == old) must already be completed */
1593 : :
1594 : 0 : spin_lock(&old->breadcrumbs.irq_lock);
1595 [ # # ]: 0 : if (!list_empty(&ve->context.signal_link)) {
1596 : 0 : list_move_tail(&ve->context.signal_link,
1597 : : &engine->breadcrumbs.signalers);
1598 : 0 : intel_engine_signal_breadcrumbs(engine);
1599 : : }
1600 : 0 : spin_unlock(&old->breadcrumbs.irq_lock);
1601 : 0 : }
1602 : :
1603 : : #define for_each_waiter(p__, rq__) \
1604 : : list_for_each_entry_lockless(p__, \
1605 : : &(rq__)->sched.waiters_list, \
1606 : : wait_link)
1607 : :
1608 : 0 : static void defer_request(struct i915_request *rq, struct list_head * const pl)
1609 : : {
1610 : 0 : LIST_HEAD(list);
1611 : :
1612 : : /*
1613 : : * We want to move the interrupted request to the back of
1614 : : * the round-robin list (i.e. its priority level), but
1615 : : * in doing so, we must then move all requests that were in
1616 : : * flight and were waiting for the interrupted request to
1617 : : * be run after it again.
1618 : : */
1619 : 0 : do {
1620 : 0 : struct i915_dependency *p;
1621 : :
1622 : 0 : GEM_BUG_ON(i915_request_is_active(rq));
1623 : 0 : list_move_tail(&rq->sched.link, pl);
1624 : :
1625 [ # # ]: 0 : for_each_waiter(p, rq) {
1626 : 0 : struct i915_request *w =
1627 : 0 : container_of(p->waiter, typeof(*w), sched);
1628 : :
1629 : : /* Leave semaphores spinning on the other engines */
1630 [ # # ]: 0 : if (w->engine != rq->engine)
1631 : 0 : continue;
1632 : :
1633 : : /* No waiter should start before its signaler */
1634 : 0 : GEM_BUG_ON(i915_request_started(w) &&
1635 : : !i915_request_completed(rq));
1636 : :
1637 : 0 : GEM_BUG_ON(i915_request_is_active(w));
1638 [ # # ]: 0 : if (!i915_request_is_ready(w))
1639 : 0 : continue;
1640 : :
1641 [ # # ]: 0 : if (rq_prio(w) < rq_prio(rq))
1642 : 0 : continue;
1643 : :
1644 : 0 : GEM_BUG_ON(rq_prio(w) > rq_prio(rq));
1645 : 0 : list_move_tail(&w->sched.link, &list);
1646 : : }
1647 : :
1648 [ # # ]: 0 : rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
1649 [ # # ]: 0 : } while (rq);
1650 : 0 : }
1651 : :
1652 : 0 : static void defer_active(struct intel_engine_cs *engine)
1653 : : {
1654 : 0 : struct i915_request *rq;
1655 : :
1656 : 0 : rq = __unwind_incomplete_requests(engine);
1657 [ # # ]: 0 : if (!rq)
1658 : : return;
1659 : :
1660 : 0 : defer_request(rq, i915_sched_lookup_priolist(engine, rq_prio(rq)));
1661 : : }
1662 : :
1663 : : static bool
1664 : 0 : need_timeslice(struct intel_engine_cs *engine, const struct i915_request *rq)
1665 : : {
1666 : 0 : int hint;
1667 : :
1668 [ # # ]: 0 : if (!intel_engine_has_timeslices(engine))
1669 : : return false;
1670 : :
1671 : 0 : hint = engine->execlists.queue_priority_hint;
1672 [ # # ]: 0 : if (!list_is_last(&rq->sched.link, &engine->active.requests))
1673 : 0 : hint = max(hint, rq_prio(list_next_entry(rq, sched.link)));
1674 : :
1675 : 0 : return hint >= effective_prio(rq);
1676 : : }
1677 : :
1678 : : static int
1679 : 0 : switch_prio(struct intel_engine_cs *engine, const struct i915_request *rq)
1680 : : {
1681 : 0 : if (list_is_last(&rq->sched.link, &engine->active.requests))
1682 : : return INT_MIN;
1683 : :
1684 : 0 : return rq_prio(list_next_entry(rq, sched.link));
1685 : : }
1686 : :
1687 : : static inline unsigned long
1688 : 0 : timeslice(const struct intel_engine_cs *engine)
1689 : : {
1690 : 0 : return READ_ONCE(engine->props.timeslice_duration_ms);
1691 : : }
1692 : :
1693 : : static unsigned long
1694 : 0 : active_timeslice(const struct intel_engine_cs *engine)
1695 : : {
1696 : 0 : const struct i915_request *rq = *engine->execlists.active;
1697 : :
1698 [ # # # # ]: 0 : if (!rq || i915_request_completed(rq))
1699 : 0 : return 0;
1700 : :
1701 [ # # ]: 0 : if (engine->execlists.switch_priority_hint < effective_prio(rq))
1702 : : return 0;
1703 : :
1704 : 0 : return timeslice(engine);
1705 : : }
1706 : :
1707 : 0 : static void set_timeslice(struct intel_engine_cs *engine)
1708 : : {
1709 [ # # ]: 0 : if (!intel_engine_has_timeslices(engine))
1710 : : return;
1711 : :
1712 : 0 : set_timer_ms(&engine->execlists.timer, active_timeslice(engine));
1713 : : }
1714 : :
1715 : 0 : static void start_timeslice(struct intel_engine_cs *engine)
1716 : : {
1717 : 0 : struct intel_engine_execlists *execlists = &engine->execlists;
1718 : :
1719 : 0 : execlists->switch_priority_hint = execlists->queue_priority_hint;
1720 : :
1721 : 0 : if (timer_pending(&execlists->timer))
1722 : : return;
1723 : :
1724 : 0 : set_timer_ms(&execlists->timer, timeslice(engine));
1725 : : }
1726 : :
1727 : 0 : static void record_preemption(struct intel_engine_execlists *execlists)
1728 : : {
1729 : 0 : (void)I915_SELFTEST_ONLY(execlists->preempt_hang.count++);
1730 : : }
1731 : :
1732 : 0 : static unsigned long active_preempt_timeout(struct intel_engine_cs *engine,
1733 : : const struct i915_request *rq)
1734 : : {
1735 : 0 : if (!rq)
1736 : : return 0;
1737 : :
1738 : : /* Force a fast reset for terminated contexts (ignoring sysfs!) */
1739 [ # # ]: 0 : if (unlikely(intel_context_is_banned(rq->context)))
1740 : : return 1;
1741 : :
1742 : 0 : return READ_ONCE(engine->props.preempt_timeout_ms);
1743 : : }
1744 : :
1745 : 0 : static void set_preempt_timeout(struct intel_engine_cs *engine,
1746 : : const struct i915_request *rq)
1747 : : {
1748 [ # # ]: 0 : if (!intel_engine_has_preempt_reset(engine))
1749 : : return;
1750 : :
1751 [ # # ]: 0 : set_timer_ms(&engine->execlists.preempt,
1752 : : active_preempt_timeout(engine, rq));
1753 : : }
1754 : :
1755 : 0 : static inline void clear_ports(struct i915_request **ports, int count)
1756 : : {
1757 : 0 : memset_p((void **)ports, NULL, count);
1758 : : }
1759 : :
1760 : 0 : static void execlists_dequeue(struct intel_engine_cs *engine)
1761 : : {
1762 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
1763 : 0 : struct i915_request **port = execlists->pending;
1764 : 0 : struct i915_request ** const last_port = port + execlists->port_mask;
1765 : 0 : struct i915_request * const *active;
1766 : 0 : struct i915_request *last;
1767 : 0 : struct rb_node *rb;
1768 : 0 : bool submit = false;
1769 : :
1770 : : /*
1771 : : * Hardware submission is through 2 ports. Conceptually each port
1772 : : * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
1773 : : * static for a context, and unique to each, so we only execute
1774 : : * requests belonging to a single context from each ring. RING_HEAD
1775 : : * is maintained by the CS in the context image, it marks the place
1776 : : * where it got up to last time, and through RING_TAIL we tell the CS
1777 : : * where we want to execute up to this time.
1778 : : *
1779 : : * In this list the requests are in order of execution. Consecutive
1780 : : * requests from the same context are adjacent in the ringbuffer. We
1781 : : * can combine these requests into a single RING_TAIL update:
1782 : : *
1783 : : * RING_HEAD...req1...req2
1784 : : * ^- RING_TAIL
1785 : : * since to execute req2 the CS must first execute req1.
1786 : : *
1787 : : * Our goal then is to point each port to the end of a consecutive
1788 : : * sequence of requests as being the most optimal (fewest wake ups
1789 : : * and context switches) submission.
1790 : : */
1791 : :
1792 [ # # ]: 0 : for (rb = rb_first_cached(&execlists->virtual); rb; ) {
1793 : 0 : struct virtual_engine *ve =
1794 : 0 : rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1795 [ # # ]: 0 : struct i915_request *rq = READ_ONCE(ve->request);
1796 : :
1797 [ # # ]: 0 : if (!rq) { /* lazily cleanup after another engine handled rq */
1798 : 0 : rb_erase_cached(rb, &execlists->virtual);
1799 : 0 : RB_CLEAR_NODE(rb);
1800 : 0 : rb = rb_first_cached(&execlists->virtual);
1801 : 0 : continue;
1802 : : }
1803 : :
1804 [ # # ]: 0 : if (!virtual_matches(ve, rq, engine)) {
1805 : 0 : rb = rb_next(rb);
1806 : 0 : continue;
1807 : : }
1808 : :
1809 : : break;
1810 : : }
1811 : :
1812 : : /*
1813 : : * If the queue is higher priority than the last
1814 : : * request in the currently active context, submit afresh.
1815 : : * We will resubmit again afterwards in case we need to split
1816 : : * the active context to interject the preemption request,
1817 : : * i.e. we will retrigger preemption following the ack in case
1818 : : * of trouble.
1819 : : */
1820 : 0 : active = READ_ONCE(execlists->active);
1821 [ # # # # ]: 0 : while ((last = *active) && i915_request_completed(last))
1822 : 0 : active++;
1823 : :
1824 [ # # ]: 0 : if (last) {
1825 [ # # ]: 0 : if (need_preempt(engine, last, rb)) {
1826 : 0 : ENGINE_TRACE(engine,
1827 : : "preempting last=%llx:%lld, prio=%d, hint=%d\n",
1828 : : last->fence.context,
1829 : : last->fence.seqno,
1830 : : last->sched.attr.priority,
1831 : : execlists->queue_priority_hint);
1832 : 0 : record_preemption(execlists);
1833 : :
1834 : : /*
1835 : : * Don't let the RING_HEAD advance past the breadcrumb
1836 : : * as we unwind (and until we resubmit) so that we do
1837 : : * not accidentally tell it to go backwards.
1838 : : */
1839 : 0 : ring_set_paused(engine, 1);
1840 : :
1841 : : /*
1842 : : * Note that we have not stopped the GPU at this point,
1843 : : * so we are unwinding the incomplete requests as they
1844 : : * remain inflight and so by the time we do complete
1845 : : * the preemption, some of the unwound requests may
1846 : : * complete!
1847 : : */
1848 : 0 : __unwind_incomplete_requests(engine);
1849 : :
1850 : 0 : last = NULL;
1851 [ # # # # ]: 0 : } else if (need_timeslice(engine, last) &&
1852 : : timer_expired(&engine->execlists.timer)) {
1853 : 0 : ENGINE_TRACE(engine,
1854 : : "expired last=%llx:%lld, prio=%d, hint=%d\n",
1855 : : last->fence.context,
1856 : : last->fence.seqno,
1857 : : last->sched.attr.priority,
1858 : : execlists->queue_priority_hint);
1859 : :
1860 : 0 : ring_set_paused(engine, 1);
1861 : 0 : defer_active(engine);
1862 : :
1863 : : /*
1864 : : * Unlike for preemption, if we rewind and continue
1865 : : * executing the same context as previously active,
1866 : : * the order of execution will remain the same and
1867 : : * the tail will only advance. We do not need to
1868 : : * force a full context restore, as a lite-restore
1869 : : * is sufficient to resample the monotonic TAIL.
1870 : : *
1871 : : * If we switch to any other context, similarly we
1872 : : * will not rewind TAIL of current context, and
1873 : : * normal save/restore will preserve state and allow
1874 : : * us to later continue executing the same request.
1875 : : */
1876 : 0 : last = NULL;
1877 : : } else {
1878 : : /*
1879 : : * Otherwise if we already have a request pending
1880 : : * for execution after the current one, we can
1881 : : * just wait until the next CS event before
1882 : : * queuing more. In either case we will force a
1883 : : * lite-restore preemption event, but if we wait
1884 : : * we hopefully coalesce several updates into a single
1885 : : * submission.
1886 : : */
1887 : 0 : if (!list_is_last(&last->sched.link,
1888 [ # # ]: 0 : &engine->active.requests)) {
1889 : : /*
1890 : : * Even if ELSP[1] is occupied and not worthy
1891 : : * of timeslices, our queue might be.
1892 : : */
1893 [ # # ]: 0 : start_timeslice(engine);
1894 : 0 : return;
1895 : : }
1896 : : }
1897 : : }
1898 : :
1899 [ # # ]: 0 : while (rb) { /* XXX virtual is always taking precedence */
1900 : 0 : struct virtual_engine *ve =
1901 : 0 : rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
1902 : 0 : struct i915_request *rq;
1903 : :
1904 : 0 : spin_lock(&ve->base.active.lock);
1905 : :
1906 : 0 : rq = ve->request;
1907 [ # # ]: 0 : if (unlikely(!rq)) { /* lost the race to a sibling */
1908 : 0 : spin_unlock(&ve->base.active.lock);
1909 : 0 : rb_erase_cached(rb, &execlists->virtual);
1910 : 0 : RB_CLEAR_NODE(rb);
1911 : 0 : rb = rb_first_cached(&execlists->virtual);
1912 : 0 : continue;
1913 : : }
1914 : :
1915 : 0 : GEM_BUG_ON(rq != ve->request);
1916 : 0 : GEM_BUG_ON(rq->engine != &ve->base);
1917 : 0 : GEM_BUG_ON(rq->context != &ve->context);
1918 : :
1919 [ # # # # ]: 0 : if (rq_prio(rq) >= queue_prio(execlists)) {
1920 [ # # ]: 0 : if (!virtual_matches(ve, rq, engine)) {
1921 : 0 : spin_unlock(&ve->base.active.lock);
1922 : 0 : rb = rb_next(rb);
1923 : 0 : continue;
1924 : : }
1925 : :
1926 [ # # # # ]: 0 : if (last && !can_merge_rq(last, rq)) {
1927 : 0 : spin_unlock(&ve->base.active.lock);
1928 [ # # ]: 0 : start_timeslice(engine);
1929 : 0 : return; /* leave this for another sibling */
1930 : : }
1931 : :
1932 : 0 : ENGINE_TRACE(engine,
1933 : : "virtual rq=%llx:%lld%s, new engine? %s\n",
1934 : : rq->fence.context,
1935 : : rq->fence.seqno,
1936 : : i915_request_completed(rq) ? "!" :
1937 : : i915_request_started(rq) ? "*" :
1938 : : "",
1939 : : yesno(engine != ve->siblings[0]));
1940 : :
1941 : 0 : ve->request = NULL;
1942 : 0 : ve->base.execlists.queue_priority_hint = INT_MIN;
1943 : 0 : rb_erase_cached(rb, &execlists->virtual);
1944 : 0 : RB_CLEAR_NODE(rb);
1945 : :
1946 : 0 : GEM_BUG_ON(!(rq->execution_mask & engine->mask));
1947 : 0 : rq->engine = engine;
1948 : :
1949 [ # # ]: 0 : if (engine != ve->siblings[0]) {
1950 : 0 : u32 *regs = ve->context.lrc_reg_state;
1951 : 0 : unsigned int n;
1952 : :
1953 : 0 : GEM_BUG_ON(READ_ONCE(ve->context.inflight));
1954 : :
1955 [ # # ]: 0 : if (!intel_engine_has_relative_mmio(engine))
1956 : 0 : virtual_update_register_offsets(regs,
1957 : : engine);
1958 : :
1959 [ # # ]: 0 : if (!list_empty(&ve->context.signals))
1960 : 0 : virtual_xfer_breadcrumbs(ve, engine);
1961 : :
1962 : : /*
1963 : : * Move the bound engine to the top of the list
1964 : : * for future execution. We then kick this
1965 : : * tasklet first before checking others, so that
1966 : : * we preferentially reuse this set of bound
1967 : : * registers.
1968 : : */
1969 [ # # ]: 0 : for (n = 1; n < ve->num_siblings; n++) {
1970 [ # # ]: 0 : if (ve->siblings[n] == engine) {
1971 : 0 : swap(ve->siblings[n],
1972 : : ve->siblings[0]);
1973 : 0 : break;
1974 : : }
1975 : : }
1976 : :
1977 : 0 : GEM_BUG_ON(ve->siblings[0] != engine);
1978 : : }
1979 : :
1980 [ # # ]: 0 : if (__i915_request_submit(rq)) {
1981 : 0 : submit = true;
1982 : 0 : last = rq;
1983 : : }
1984 : 0 : i915_request_put(rq);
1985 : :
1986 : : /*
1987 : : * Hmm, we have a bunch of virtual engine requests,
1988 : : * but the first one was already completed (thanks
1989 : : * preempt-to-busy!). Keep looking at the veng queue
1990 : : * until we have no more relevant requests (i.e.
1991 : : * the normal submit queue has higher priority).
1992 : : */
1993 [ # # ]: 0 : if (!submit) {
1994 : 0 : spin_unlock(&ve->base.active.lock);
1995 : 0 : rb = rb_first_cached(&execlists->virtual);
1996 : 0 : continue;
1997 : : }
1998 : : }
1999 : :
2000 : 0 : spin_unlock(&ve->base.active.lock);
2001 : : break;
2002 : : }
2003 : :
2004 [ # # ]: 0 : while ((rb = rb_first_cached(&execlists->queue))) {
2005 : 0 : struct i915_priolist *p = to_priolist(rb);
2006 : 0 : struct i915_request *rq, *rn;
2007 : 0 : int i;
2008 : :
2009 [ # # # # ]: 0 : priolist_for_each_request_consume(rq, rn, p, i) {
2010 : 0 : bool merge = true;
2011 : :
2012 : : /*
2013 : : * Can we combine this request with the current port?
2014 : : * It has to be the same context/ringbuffer and not
2015 : : * have any exceptions (e.g. GVT saying never to
2016 : : * combine contexts).
2017 : : *
2018 : : * If we can combine the requests, we can execute both
2019 : : * by updating the RING_TAIL to point to the end of the
2020 : : * second request, and so we never need to tell the
2021 : : * hardware about the first.
2022 : : */
2023 [ # # # # ]: 0 : if (last && !can_merge_rq(last, rq)) {
2024 : : /*
2025 : : * If we are on the second port and cannot
2026 : : * combine this request with the last, then we
2027 : : * are done.
2028 : : */
2029 [ # # ]: 0 : if (port == last_port)
2030 : 0 : goto done;
2031 : :
2032 : : /*
2033 : : * We must not populate both ELSP[] with the
2034 : : * same LRCA, i.e. we must submit 2 different
2035 : : * contexts if we submit 2 ELSP.
2036 : : */
2037 [ # # ]: 0 : if (last->context == rq->context)
2038 : 0 : goto done;
2039 : :
2040 [ # # ]: 0 : if (i915_request_has_sentinel(last))
2041 : 0 : goto done;
2042 : :
2043 : : /*
2044 : : * If GVT overrides us we only ever submit
2045 : : * port[0], leaving port[1] empty. Note that we
2046 : : * also have to be careful that we don't queue
2047 : : * the same context (even though a different
2048 : : * request) to the second port.
2049 : : */
2050 : : if (ctx_single_port_submission(last->context) ||
2051 : : ctx_single_port_submission(rq->context))
2052 : : goto done;
2053 : :
2054 : : merge = false;
2055 : : }
2056 : :
2057 [ # # ]: 0 : if (__i915_request_submit(rq)) {
2058 [ # # ]: 0 : if (!merge) {
2059 : 0 : *port = execlists_schedule_in(last, port - execlists->pending);
2060 : 0 : port++;
2061 : 0 : last = NULL;
2062 : : }
2063 : :
2064 : : GEM_BUG_ON(last &&
2065 : : !can_merge_ctx(last->context,
2066 : : rq->context));
2067 : :
2068 : : submit = true;
2069 : : last = rq;
2070 : : }
2071 : : }
2072 : :
2073 : 0 : rb_erase_cached(&p->node, &execlists->queue);
2074 [ # # ]: 0 : i915_priolist_free(p);
2075 : : }
2076 : :
2077 : 0 : done:
2078 : : /*
2079 : : * Here be a bit of magic! Or sleight-of-hand, whichever you prefer.
2080 : : *
2081 : : * We choose the priority hint such that if we add a request of greater
2082 : : * priority than this, we kick the submission tasklet to decide on
2083 : : * the right order of submitting the requests to hardware. We must
2084 : : * also be prepared to reorder requests as they are in-flight on the
2085 : : * HW. We derive the priority hint then as the first "hole" in
2086 : : * the HW submission ports and if there are no available slots,
2087 : : * the priority of the lowest executing request, i.e. last.
2088 : : *
2089 : : * When we do receive a higher priority request ready to run from the
2090 : : * user, see queue_request(), the priority hint is bumped to that
2091 : : * request triggering preemption on the next dequeue (or subsequent
2092 : : * interrupt for secondary ports).
2093 : : */
2094 [ # # ]: 0 : execlists->queue_priority_hint = queue_prio(execlists);
2095 : :
2096 [ # # ]: 0 : if (submit) {
2097 : 0 : *port = execlists_schedule_in(last, port - execlists->pending);
2098 : 0 : execlists->switch_priority_hint =
2099 [ # # ]: 0 : switch_prio(engine, *execlists->pending);
2100 : :
2101 : : /*
2102 : : * Skip if we ended up with exactly the same set of requests,
2103 : : * e.g. trying to timeslice a pair of ordered contexts
2104 : : */
2105 : 0 : if (!memcmp(active, execlists->pending,
2106 [ # # ]: 0 : (port - execlists->pending + 1) * sizeof(*port))) {
2107 : 0 : do
2108 : 0 : execlists_schedule_out(fetch_and_zero(port));
2109 [ # # ]: 0 : while (port-- != execlists->pending);
2110 : :
2111 : 0 : goto skip_submit;
2112 : : }
2113 : 0 : clear_ports(port + 1, last_port - port);
2114 : :
2115 : 0 : execlists_submit_ports(engine);
2116 : 0 : set_preempt_timeout(engine, *active);
2117 : : } else {
2118 : 0 : skip_submit:
2119 : 0 : ring_set_paused(engine, 0);
2120 : : }
2121 : : }
2122 : :
2123 : : static void
2124 : 0 : cancel_port_requests(struct intel_engine_execlists * const execlists)
2125 : : {
2126 : 0 : struct i915_request * const *port;
2127 : :
2128 [ # # ]: 0 : for (port = execlists->pending; *port; port++)
2129 : 0 : execlists_schedule_out(*port);
2130 : 0 : clear_ports(execlists->pending, ARRAY_SIZE(execlists->pending));
2131 : :
2132 : : /* Mark the end of active before we overwrite *active */
2133 [ # # ]: 0 : for (port = xchg(&execlists->active, execlists->pending); *port; port++)
2134 : 0 : execlists_schedule_out(*port);
2135 : 0 : clear_ports(execlists->inflight, ARRAY_SIZE(execlists->inflight));
2136 : :
2137 : 0 : WRITE_ONCE(execlists->active, execlists->inflight);
2138 : 0 : }
2139 : :
2140 : : static inline void
2141 : 0 : invalidate_csb_entries(const u32 *first, const u32 *last)
2142 : : {
2143 : 0 : clflush((void *)first);
2144 : 0 : clflush((void *)last);
2145 : 0 : }
2146 : :
2147 : : static inline bool
2148 : 0 : reset_in_progress(const struct intel_engine_execlists *execlists)
2149 : : {
2150 : 0 : return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
2151 : : }
2152 : :
2153 : : /*
2154 : : * Starting with Gen12, the status has a new format:
2155 : : *
2156 : : * bit 0: switched to new queue
2157 : : * bit 1: reserved
2158 : : * bit 2: semaphore wait mode (poll or signal), only valid when
2159 : : * switch detail is set to "wait on semaphore"
2160 : : * bits 3-5: engine class
2161 : : * bits 6-11: engine instance
2162 : : * bits 12-14: reserved
2163 : : * bits 15-25: sw context id of the lrc the GT switched to
2164 : : * bits 26-31: sw counter of the lrc the GT switched to
2165 : : * bits 32-35: context switch detail
2166 : : * - 0: ctx complete
2167 : : * - 1: wait on sync flip
2168 : : * - 2: wait on vblank
2169 : : * - 3: wait on scanline
2170 : : * - 4: wait on semaphore
2171 : : * - 5: context preempted (not on SEMAPHORE_WAIT or
2172 : : * WAIT_FOR_EVENT)
2173 : : * bit 36: reserved
2174 : : * bits 37-43: wait detail (for switch detail 1 to 4)
2175 : : * bits 44-46: reserved
2176 : : * bits 47-57: sw context id of the lrc the GT switched away from
2177 : : * bits 58-63: sw counter of the lrc the GT switched away from
2178 : : */
2179 : : static inline bool
2180 : 0 : gen12_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2181 : : {
2182 : 0 : u32 lower_dw = csb[0];
2183 : 0 : u32 upper_dw = csb[1];
2184 : 0 : bool ctx_to_valid = GEN12_CSB_CTX_VALID(lower_dw);
2185 : 0 : bool ctx_away_valid = GEN12_CSB_CTX_VALID(upper_dw);
2186 : 0 : bool new_queue = lower_dw & GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE;
2187 : :
2188 : : /*
2189 : : * The context switch detail is not guaranteed to be 5 when a preemption
2190 : : * occurs, so we can't just check for that. The check below works for
2191 : : * all the cases we care about, including preemptions of WAIT
2192 : : * instructions and lite-restore. Preempt-to-idle via the CTRL register
2193 : : * would require some extra handling, but we don't support that.
2194 : : */
2195 : 0 : if (!ctx_away_valid || new_queue) {
2196 : : GEM_BUG_ON(!ctx_to_valid);
2197 : : return true;
2198 : : }
2199 : :
2200 : : /*
2201 : : * switch detail = 5 is covered by the case above and we do not expect a
2202 : : * context switch on an unsuccessful wait instruction since we always
2203 : : * use polling mode.
2204 : : */
2205 : : GEM_BUG_ON(GEN12_CTX_SWITCH_DETAIL(upper_dw));
2206 : : return false;
2207 : : }
2208 : :
2209 : : static inline bool
2210 : 0 : gen8_csb_parse(const struct intel_engine_execlists *execlists, const u32 *csb)
2211 : : {
2212 : 0 : return *csb & (GEN8_CTX_STATUS_IDLE_ACTIVE | GEN8_CTX_STATUS_PREEMPTED);
2213 : : }
2214 : :
2215 : 0 : static void process_csb(struct intel_engine_cs *engine)
2216 : : {
2217 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
2218 : 0 : const u32 * const buf = execlists->csb_status;
2219 : 0 : const u8 num_entries = execlists->csb_size;
2220 : 0 : u8 head, tail;
2221 : :
2222 : : /*
2223 : : * As we modify our execlists state tracking we require exclusive
2224 : : * access. Either we are inside the tasklet, or the tasklet is disabled
2225 : : * and we assume that is only inside the reset paths and so serialised.
2226 : : */
2227 : 0 : GEM_BUG_ON(!tasklet_is_locked(&execlists->tasklet) &&
2228 : : !reset_in_progress(execlists));
2229 : 0 : GEM_BUG_ON(!intel_engine_in_execlists_submission_mode(engine));
2230 : :
2231 : : /*
2232 : : * Note that csb_write, csb_status may be either in HWSP or mmio.
2233 : : * When reading from the csb_write mmio register, we have to be
2234 : : * careful to only use the GEN8_CSB_WRITE_PTR portion, which is
2235 : : * the low 4bits. As it happens we know the next 4bits are always
2236 : : * zero and so we can simply masked off the low u8 of the register
2237 : : * and treat it identically to reading from the HWSP (without having
2238 : : * to use explicit shifting and masking, and probably bifurcating
2239 : : * the code to handle the legacy mmio read).
2240 : : */
2241 : 0 : head = execlists->csb_head;
2242 [ # # ]: 0 : tail = READ_ONCE(*execlists->csb_write);
2243 : 0 : ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2244 [ # # ]: 0 : if (unlikely(head == tail))
2245 : : return;
2246 : :
2247 : : /*
2248 : : * Hopefully paired with a wmb() in HW!
2249 : : *
2250 : : * We must complete the read of the write pointer before any reads
2251 : : * from the CSB, so that we do not see stale values. Without an rmb
2252 : : * (lfence) the HW may speculatively perform the CSB[] reads *before*
2253 : : * we perform the READ_ONCE(*csb_write).
2254 : : */
2255 : 0 : rmb();
2256 : :
2257 : 0 : do {
2258 : 0 : bool promote;
2259 : :
2260 [ # # ]: 0 : if (++head == num_entries)
2261 : 0 : head = 0;
2262 : :
2263 : : /*
2264 : : * We are flying near dragons again.
2265 : : *
2266 : : * We hold a reference to the request in execlist_port[]
2267 : : * but no more than that. We are operating in softirq
2268 : : * context and so cannot hold any mutex or sleep. That
2269 : : * prevents us stopping the requests we are processing
2270 : : * in port[] from being retired simultaneously (the
2271 : : * breadcrumb will be complete before we see the
2272 : : * context-switch). As we only hold the reference to the
2273 : : * request, any pointer chasing underneath the request
2274 : : * is subject to a potential use-after-free. Thus we
2275 : : * store all of the bookkeeping within port[] as
2276 : : * required, and avoid using unguarded pointers beneath
2277 : : * request itself. The same applies to the atomic
2278 : : * status notifier.
2279 : : */
2280 : :
2281 : 0 : ENGINE_TRACE(engine, "csb[%d]: status=0x%08x:0x%08x\n",
2282 : : head, buf[2 * head + 0], buf[2 * head + 1]);
2283 : :
2284 [ # # ]: 0 : if (INTEL_GEN(engine->i915) >= 12)
2285 [ # # ]: 0 : promote = gen12_csb_parse(execlists, buf + 2 * head);
2286 : : else
2287 : 0 : promote = gen8_csb_parse(execlists, buf + 2 * head);
2288 [ # # ]: 0 : if (promote) {
2289 : 0 : struct i915_request * const *old = execlists->active;
2290 : :
2291 : : /* Point active to the new ELSP; prevent overwriting */
2292 : 0 : WRITE_ONCE(execlists->active, execlists->pending);
2293 : :
2294 : 0 : if (!inject_preempt_hang(execlists))
2295 : 0 : ring_set_paused(engine, 0);
2296 : :
2297 : : /* cancel old inflight, prepare for switch */
2298 : 0 : trace_ports(execlists, "preempted", old);
2299 [ # # ]: 0 : while (*old)
2300 : 0 : execlists_schedule_out(*old++);
2301 : :
2302 : : /* switch pending to inflight */
2303 : 0 : GEM_BUG_ON(!assert_pending_valid(execlists, "promote"));
2304 : 0 : WRITE_ONCE(execlists->active,
2305 : : memcpy(execlists->inflight,
2306 : : execlists->pending,
2307 : : execlists_num_ports(execlists) *
2308 : : sizeof(*execlists->pending)));
2309 : :
2310 : 0 : WRITE_ONCE(execlists->pending[0], NULL);
2311 : : } else {
2312 : 0 : GEM_BUG_ON(!*execlists->active);
2313 : :
2314 : : /* port0 completed, advanced to port1 */
2315 : 0 : trace_ports(execlists, "completed", execlists->active);
2316 : :
2317 : : /*
2318 : : * We rely on the hardware being strongly
2319 : : * ordered, that the breadcrumb write is
2320 : : * coherent (visible from the CPU) before the
2321 : : * user interrupt and CSB is processed.
2322 : : */
2323 : 0 : GEM_BUG_ON(!i915_request_completed(*execlists->active) &&
2324 : : !reset_in_progress(execlists));
2325 : 0 : execlists_schedule_out(*execlists->active++);
2326 : :
2327 : 0 : GEM_BUG_ON(execlists->active - execlists->inflight >
2328 : : execlists_num_ports(execlists));
2329 : : }
2330 [ # # ]: 0 : } while (head != tail);
2331 : :
2332 : 0 : execlists->csb_head = head;
2333 : 0 : set_timeslice(engine);
2334 : :
2335 : : /*
2336 : : * Gen11 has proven to fail wrt global observation point between
2337 : : * entry and tail update, failing on the ordering and thus
2338 : : * we see an old entry in the context status buffer.
2339 : : *
2340 : : * Forcibly evict out entries for the next gpu csb update,
2341 : : * to increase the odds that we get a fresh entries with non
2342 : : * working hardware. The cost for doing so comes out mostly with
2343 : : * the wash as hardware, working or not, will need to do the
2344 : : * invalidation before.
2345 : : */
2346 : 0 : invalidate_csb_entries(&buf[0], &buf[num_entries - 1]);
2347 : : }
2348 : :
2349 : 0 : static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
2350 : : {
2351 : 0 : lockdep_assert_held(&engine->active.lock);
2352 : 0 : if (!engine->execlists.pending[0]) {
2353 : 0 : rcu_read_lock(); /* protect peeking at execlists->active */
2354 : 0 : execlists_dequeue(engine);
2355 : 0 : rcu_read_unlock();
2356 : : }
2357 : : }
2358 : :
2359 : 0 : static void __execlists_hold(struct i915_request *rq)
2360 : : {
2361 : 0 : LIST_HEAD(list);
2362 : :
2363 : 0 : do {
2364 : 0 : struct i915_dependency *p;
2365 : :
2366 [ # # ]: 0 : if (i915_request_is_active(rq))
2367 : 0 : __i915_request_unsubmit(rq);
2368 : :
2369 : 0 : RQ_TRACE(rq, "on hold\n");
2370 : 0 : clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2371 : 0 : list_move_tail(&rq->sched.link, &rq->engine->active.hold);
2372 : 0 : i915_request_set_hold(rq);
2373 : :
2374 [ # # ]: 0 : list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2375 : 0 : struct i915_request *w =
2376 : 0 : container_of(p->waiter, typeof(*w), sched);
2377 : :
2378 : : /* Leave semaphores spinning on the other engines */
2379 [ # # ]: 0 : if (w->engine != rq->engine)
2380 : 0 : continue;
2381 : :
2382 [ # # ]: 0 : if (!i915_request_is_ready(w))
2383 : 0 : continue;
2384 : :
2385 [ # # ]: 0 : if (i915_request_completed(w))
2386 : 0 : continue;
2387 : :
2388 [ # # ]: 0 : if (i915_request_on_hold(rq))
2389 : 0 : continue;
2390 : :
2391 : 0 : list_move_tail(&w->sched.link, &list);
2392 : : }
2393 : :
2394 [ # # ]: 0 : rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2395 [ # # ]: 0 : } while (rq);
2396 : 0 : }
2397 : :
2398 : 0 : static bool execlists_hold(struct intel_engine_cs *engine,
2399 : : struct i915_request *rq)
2400 : : {
2401 : 0 : spin_lock_irq(&engine->active.lock);
2402 : :
2403 [ # # ]: 0 : if (i915_request_completed(rq)) { /* too late! */
2404 : 0 : rq = NULL;
2405 : 0 : goto unlock;
2406 : : }
2407 : :
2408 [ # # ]: 0 : if (rq->engine != engine) { /* preempted virtual engine */
2409 : 0 : struct virtual_engine *ve = to_virtual_engine(rq->engine);
2410 : :
2411 : : /*
2412 : : * intel_context_inflight() is only protected by virtue
2413 : : * of process_csb() being called only by the tasklet (or
2414 : : * directly from inside reset while the tasklet is suspended).
2415 : : * Assert that neither of those are allowed to run while we
2416 : : * poke at the request queues.
2417 : : */
2418 : 0 : GEM_BUG_ON(!reset_in_progress(&engine->execlists));
2419 : :
2420 : : /*
2421 : : * An unsubmitted request along a virtual engine will
2422 : : * remain on the active (this) engine until we are able
2423 : : * to process the context switch away (and so mark the
2424 : : * context as no longer in flight). That cannot have happened
2425 : : * yet, otherwise we would not be hanging!
2426 : : */
2427 : 0 : spin_lock(&ve->base.active.lock);
2428 : 0 : GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
2429 : 0 : GEM_BUG_ON(ve->request != rq);
2430 : 0 : ve->request = NULL;
2431 : 0 : spin_unlock(&ve->base.active.lock);
2432 : 0 : i915_request_put(rq);
2433 : :
2434 : 0 : rq->engine = engine;
2435 : : }
2436 : :
2437 : : /*
2438 : : * Transfer this request onto the hold queue to prevent it
2439 : : * being resumbitted to HW (and potentially completed) before we have
2440 : : * released it. Since we may have already submitted following
2441 : : * requests, we need to remove those as well.
2442 : : */
2443 : 0 : GEM_BUG_ON(i915_request_on_hold(rq));
2444 : 0 : GEM_BUG_ON(rq->engine != engine);
2445 : 0 : __execlists_hold(rq);
2446 : :
2447 : 0 : unlock:
2448 : 0 : spin_unlock_irq(&engine->active.lock);
2449 : 0 : return rq;
2450 : : }
2451 : :
2452 : 0 : static bool hold_request(const struct i915_request *rq)
2453 : : {
2454 : 0 : struct i915_dependency *p;
2455 : :
2456 : : /*
2457 : : * If one of our ancestors is on hold, we must also be on hold,
2458 : : * otherwise we will bypass it and execute before it.
2459 : : */
2460 [ # # ]: 0 : list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
2461 : 0 : const struct i915_request *s =
2462 : 0 : container_of(p->signaler, typeof(*s), sched);
2463 : :
2464 [ # # ]: 0 : if (s->engine != rq->engine)
2465 : 0 : continue;
2466 : :
2467 [ # # ]: 0 : if (i915_request_on_hold(s))
2468 : : return true;
2469 : : }
2470 : :
2471 : : return false;
2472 : : }
2473 : :
2474 : 0 : static void __execlists_unhold(struct i915_request *rq)
2475 : : {
2476 : 0 : LIST_HEAD(list);
2477 : :
2478 : 0 : do {
2479 : 0 : struct i915_dependency *p;
2480 : :
2481 : 0 : GEM_BUG_ON(!i915_request_on_hold(rq));
2482 : 0 : GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
2483 : :
2484 : 0 : i915_request_clear_hold(rq);
2485 : 0 : list_move_tail(&rq->sched.link,
2486 : : i915_sched_lookup_priolist(rq->engine,
2487 : : rq_prio(rq)));
2488 : 0 : set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2489 : 0 : RQ_TRACE(rq, "hold release\n");
2490 : :
2491 : : /* Also release any children on this engine that are ready */
2492 [ # # ]: 0 : list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
2493 : 0 : struct i915_request *w =
2494 : 0 : container_of(p->waiter, typeof(*w), sched);
2495 : :
2496 [ # # ]: 0 : if (w->engine != rq->engine)
2497 : 0 : continue;
2498 : :
2499 [ # # ]: 0 : if (!i915_request_on_hold(rq))
2500 : 0 : continue;
2501 : :
2502 : : /* Check that no other parents are also on hold */
2503 [ # # ]: 0 : if (hold_request(rq))
2504 : 0 : continue;
2505 : :
2506 : 0 : list_move_tail(&w->sched.link, &list);
2507 : : }
2508 : :
2509 [ # # ]: 0 : rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
2510 [ # # ]: 0 : } while (rq);
2511 : 0 : }
2512 : :
2513 : 0 : static void execlists_unhold(struct intel_engine_cs *engine,
2514 : : struct i915_request *rq)
2515 : : {
2516 : 0 : spin_lock_irq(&engine->active.lock);
2517 : :
2518 : : /*
2519 : : * Move this request back to the priority queue, and all of its
2520 : : * children and grandchildren that were suspended along with it.
2521 : : */
2522 : 0 : __execlists_unhold(rq);
2523 : :
2524 [ # # ]: 0 : if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
2525 : 0 : engine->execlists.queue_priority_hint = rq_prio(rq);
2526 : 0 : tasklet_hi_schedule(&engine->execlists.tasklet);
2527 : : }
2528 : :
2529 : 0 : spin_unlock_irq(&engine->active.lock);
2530 : 0 : }
2531 : :
2532 : : struct execlists_capture {
2533 : : struct work_struct work;
2534 : : struct i915_request *rq;
2535 : : struct i915_gpu_coredump *error;
2536 : : };
2537 : :
2538 : 0 : static void execlists_capture_work(struct work_struct *work)
2539 : : {
2540 : 0 : struct execlists_capture *cap = container_of(work, typeof(*cap), work);
2541 : 0 : const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
2542 : 0 : struct intel_engine_cs *engine = cap->rq->engine;
2543 : 0 : struct intel_gt_coredump *gt = cap->error->gt;
2544 : 0 : struct intel_engine_capture_vma *vma;
2545 : :
2546 : : /* Compress all the objects attached to the request, slow! */
2547 : 0 : vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
2548 [ # # ]: 0 : if (vma) {
2549 : 0 : struct i915_vma_compress *compress =
2550 : 0 : i915_vma_capture_prepare(gt);
2551 : :
2552 : 0 : intel_engine_coredump_add_vma(gt->engine, vma, compress);
2553 : 0 : i915_vma_capture_finish(gt, compress);
2554 : : }
2555 : :
2556 : 0 : gt->simulated = gt->engine->simulated;
2557 : 0 : cap->error->simulated = gt->simulated;
2558 : :
2559 : : /* Publish the error state, and announce it to the world */
2560 : 0 : i915_error_state_store(cap->error);
2561 : 0 : i915_gpu_coredump_put(cap->error);
2562 : :
2563 : : /* Return this request and all that depend upon it for signaling */
2564 : 0 : execlists_unhold(engine, cap->rq);
2565 : 0 : i915_request_put(cap->rq);
2566 : :
2567 : 0 : kfree(cap);
2568 : 0 : }
2569 : :
2570 : 0 : static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
2571 : : {
2572 : 0 : const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
2573 : 0 : struct execlists_capture *cap;
2574 : :
2575 : 0 : cap = kmalloc(sizeof(*cap), gfp);
2576 [ # # ]: 0 : if (!cap)
2577 : : return NULL;
2578 : :
2579 : 0 : cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
2580 [ # # ]: 0 : if (!cap->error)
2581 : 0 : goto err_cap;
2582 : :
2583 : 0 : cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
2584 [ # # ]: 0 : if (!cap->error->gt)
2585 : 0 : goto err_gpu;
2586 : :
2587 : 0 : cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
2588 [ # # ]: 0 : if (!cap->error->gt->engine)
2589 : 0 : goto err_gt;
2590 : :
2591 : : return cap;
2592 : :
2593 : : err_gt:
2594 : 0 : kfree(cap->error->gt);
2595 : 0 : err_gpu:
2596 : 0 : kfree(cap->error);
2597 : 0 : err_cap:
2598 : 0 : kfree(cap);
2599 : 0 : return NULL;
2600 : : }
2601 : :
2602 : 0 : static bool execlists_capture(struct intel_engine_cs *engine)
2603 : : {
2604 : 0 : struct execlists_capture *cap;
2605 : :
2606 : 0 : if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2607 : : return true;
2608 : :
2609 : : /*
2610 : : * We need to _quickly_ capture the engine state before we reset.
2611 : : * We are inside an atomic section (softirq) here and we are delaying
2612 : : * the forced preemption event.
2613 : : */
2614 : 0 : cap = capture_regs(engine);
2615 [ # # ]: 0 : if (!cap)
2616 : : return true;
2617 : :
2618 : 0 : cap->rq = execlists_active(&engine->execlists);
2619 : 0 : GEM_BUG_ON(!cap->rq);
2620 : :
2621 : 0 : rcu_read_lock();
2622 : 0 : cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2623 : 0 : cap->rq = i915_request_get_rcu(cap->rq);
2624 : 0 : rcu_read_unlock();
2625 [ # # ]: 0 : if (!cap->rq)
2626 : 0 : goto err_free;
2627 : :
2628 : : /*
2629 : : * Remove the request from the execlists queue, and take ownership
2630 : : * of the request. We pass it to our worker who will _slowly_ compress
2631 : : * all the pages the _user_ requested for debugging their batch, after
2632 : : * which we return it to the queue for signaling.
2633 : : *
2634 : : * By removing them from the execlists queue, we also remove the
2635 : : * requests from being processed by __unwind_incomplete_requests()
2636 : : * during the intel_engine_reset(), and so they will *not* be replayed
2637 : : * afterwards.
2638 : : *
2639 : : * Note that because we have not yet reset the engine at this point,
2640 : : * it is possible for the request that we have identified as being
2641 : : * guilty, did in fact complete and we will then hit an arbitration
2642 : : * point allowing the outstanding preemption to succeed. The likelihood
2643 : : * of that is very low (as capturing of the engine registers should be
2644 : : * fast enough to run inside an irq-off atomic section!), so we will
2645 : : * simply hold that request accountable for being non-preemptible
2646 : : * long enough to force the reset.
2647 : : */
2648 [ # # ]: 0 : if (!execlists_hold(engine, cap->rq))
2649 : 0 : goto err_rq;
2650 : :
2651 : 0 : INIT_WORK(&cap->work, execlists_capture_work);
2652 : 0 : schedule_work(&cap->work);
2653 : 0 : return true;
2654 : :
2655 : : err_rq:
2656 : 0 : i915_request_put(cap->rq);
2657 : 0 : err_free:
2658 : 0 : i915_gpu_coredump_put(cap->error);
2659 : 0 : kfree(cap);
2660 : 0 : return false;
2661 : : }
2662 : :
2663 : 0 : static noinline void preempt_reset(struct intel_engine_cs *engine)
2664 : : {
2665 : 0 : const unsigned int bit = I915_RESET_ENGINE + engine->id;
2666 : 0 : unsigned long *lock = &engine->gt->reset.flags;
2667 : :
2668 [ # # ]: 0 : if (i915_modparams.reset < 3)
2669 : : return;
2670 : :
2671 [ # # ]: 0 : if (test_and_set_bit(bit, lock))
2672 : : return;
2673 : :
2674 : : /* Mark this tasklet as disabled to avoid waiting for it to complete */
2675 : 0 : tasklet_disable_nosync(&engine->execlists.tasklet);
2676 : :
2677 : 0 : ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
2678 : : READ_ONCE(engine->props.preempt_timeout_ms),
2679 : : jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
2680 : :
2681 : 0 : ring_set_paused(engine, 1); /* Freeze the current request in place */
2682 [ # # ]: 0 : if (execlists_capture(engine))
2683 : 0 : intel_engine_reset(engine, "preemption time out");
2684 : : else
2685 : 0 : ring_set_paused(engine, 0);
2686 : :
2687 : 0 : tasklet_enable(&engine->execlists.tasklet);
2688 : 0 : clear_and_wake_up_bit(bit, lock);
2689 : : }
2690 : :
2691 : 0 : static bool preempt_timeout(const struct intel_engine_cs *const engine)
2692 : : {
2693 : 0 : const struct timer_list *t = &engine->execlists.preempt;
2694 : :
2695 : 0 : if (!CONFIG_DRM_I915_PREEMPT_TIMEOUT)
2696 : : return false;
2697 : :
2698 [ # # # # : 0 : if (!timer_expired(t))
# # ]
2699 : : return false;
2700 : :
2701 [ # # ]: 0 : return READ_ONCE(engine->execlists.pending[0]);
2702 : : }
2703 : :
2704 : : /*
2705 : : * Check the unread Context Status Buffers and manage the submission of new
2706 : : * contexts to the ELSP accordingly.
2707 : : */
2708 : 0 : static void execlists_submission_tasklet(unsigned long data)
2709 : : {
2710 : 0 : struct intel_engine_cs * const engine = (struct intel_engine_cs *)data;
2711 [ # # ]: 0 : bool timeout = preempt_timeout(engine);
2712 : :
2713 : 0 : process_csb(engine);
2714 [ # # # # ]: 0 : if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {
2715 : 0 : unsigned long flags;
2716 : :
2717 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
2718 [ # # ]: 0 : __execlists_submission_tasklet(engine);
2719 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
2720 : :
2721 : : /* Recheck after serialising with direct-submission */
2722 [ # # # # ]: 0 : if (timeout && preempt_timeout(engine))
2723 : 0 : preempt_reset(engine);
2724 : : }
2725 : 0 : }
2726 : :
2727 : 0 : static void __execlists_kick(struct intel_engine_execlists *execlists)
2728 : : {
2729 : : /* Kick the tasklet for some interrupt coalescing and reset handling */
2730 : 0 : tasklet_hi_schedule(&execlists->tasklet);
2731 : : }
2732 : :
2733 : : #define execlists_kick(t, member) \
2734 : : __execlists_kick(container_of(t, struct intel_engine_execlists, member))
2735 : :
2736 : 0 : static void execlists_timeslice(struct timer_list *timer)
2737 : : {
2738 : 0 : execlists_kick(timer, timer);
2739 : 0 : }
2740 : :
2741 : 0 : static void execlists_preempt(struct timer_list *timer)
2742 : : {
2743 : 0 : execlists_kick(timer, preempt);
2744 : 0 : }
2745 : :
2746 : 0 : static void queue_request(struct intel_engine_cs *engine,
2747 : : struct i915_request *rq)
2748 : : {
2749 : 0 : GEM_BUG_ON(!list_empty(&rq->sched.link));
2750 : 0 : list_add_tail(&rq->sched.link,
2751 : : i915_sched_lookup_priolist(engine, rq_prio(rq)));
2752 : 0 : set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
2753 : 0 : }
2754 : :
2755 : 0 : static void __submit_queue_imm(struct intel_engine_cs *engine)
2756 : : {
2757 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
2758 : :
2759 [ # # ]: 0 : if (reset_in_progress(execlists))
2760 : : return; /* defer until we restart the engine following reset */
2761 : :
2762 [ # # ]: 0 : if (execlists->tasklet.func == execlists_submission_tasklet)
2763 [ # # ]: 0 : __execlists_submission_tasklet(engine);
2764 : : else
2765 : 0 : tasklet_hi_schedule(&execlists->tasklet);
2766 : : }
2767 : :
2768 : 0 : static void submit_queue(struct intel_engine_cs *engine,
2769 : : const struct i915_request *rq)
2770 : : {
2771 : 0 : struct intel_engine_execlists *execlists = &engine->execlists;
2772 : :
2773 : 0 : if (rq_prio(rq) <= execlists->queue_priority_hint)
2774 : : return;
2775 : :
2776 : 0 : execlists->queue_priority_hint = rq_prio(rq);
2777 : 0 : __submit_queue_imm(engine);
2778 : : }
2779 : :
2780 : 0 : static bool ancestor_on_hold(const struct intel_engine_cs *engine,
2781 : : const struct i915_request *rq)
2782 : : {
2783 : 0 : GEM_BUG_ON(i915_request_on_hold(rq));
2784 [ # # ]: 0 : return !list_empty(&engine->active.hold) && hold_request(rq);
2785 : : }
2786 : :
2787 : 0 : static void execlists_submit_request(struct i915_request *request)
2788 : : {
2789 : 0 : struct intel_engine_cs *engine = request->engine;
2790 : 0 : unsigned long flags;
2791 : :
2792 : : /* Will be called from irq-context when using foreign fences. */
2793 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
2794 : :
2795 [ # # # # ]: 0 : if (unlikely(ancestor_on_hold(engine, request))) {
2796 : 0 : list_add_tail(&request->sched.link, &engine->active.hold);
2797 : 0 : i915_request_set_hold(request);
2798 : : } else {
2799 : 0 : queue_request(engine, request);
2800 : :
2801 : 0 : GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
2802 : 0 : GEM_BUG_ON(list_empty(&request->sched.link));
2803 : :
2804 [ # # ]: 0 : submit_queue(engine, request);
2805 : : }
2806 : :
2807 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
2808 : 0 : }
2809 : :
2810 : 0 : static void __execlists_context_fini(struct intel_context *ce)
2811 : : {
2812 : 0 : intel_ring_put(ce->ring);
2813 : 0 : i915_vma_put(ce->state);
2814 : 0 : }
2815 : :
2816 : 0 : static void execlists_context_destroy(struct kref *kref)
2817 : : {
2818 : 0 : struct intel_context *ce = container_of(kref, typeof(*ce), ref);
2819 : :
2820 : 0 : GEM_BUG_ON(!i915_active_is_idle(&ce->active));
2821 : 0 : GEM_BUG_ON(intel_context_is_pinned(ce));
2822 : :
2823 [ # # ]: 0 : if (ce->state)
2824 : 0 : __execlists_context_fini(ce);
2825 : :
2826 : 0 : intel_context_fini(ce);
2827 : 0 : intel_context_free(ce);
2828 : 0 : }
2829 : :
2830 : : static void
2831 : 0 : set_redzone(void *vaddr, const struct intel_engine_cs *engine)
2832 : : {
2833 : 0 : if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2834 : 0 : return;
2835 : :
2836 : : vaddr += engine->context_size;
2837 : :
2838 : : memset(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE);
2839 : : }
2840 : :
2841 : : static void
2842 : 0 : check_redzone(const void *vaddr, const struct intel_engine_cs *engine)
2843 : : {
2844 : 0 : if (!IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
2845 : 0 : return;
2846 : :
2847 : : vaddr += engine->context_size;
2848 : :
2849 : : if (memchr_inv(vaddr, CONTEXT_REDZONE, I915_GTT_PAGE_SIZE))
2850 : 0 : dev_err_once(engine->i915->drm.dev,
2851 : : "%s context redzone overwritten!\n",
2852 : : engine->name);
2853 : : }
2854 : :
2855 : 0 : static void execlists_context_unpin(struct intel_context *ce)
2856 : : {
2857 : 0 : check_redzone((void *)ce->lrc_reg_state - LRC_STATE_PN * PAGE_SIZE,
2858 : : ce->engine);
2859 : :
2860 : 0 : i915_gem_object_unpin_map(ce->state->obj);
2861 : 0 : }
2862 : :
2863 : : static void
2864 : 0 : __execlists_update_reg_state(const struct intel_context *ce,
2865 : : const struct intel_engine_cs *engine,
2866 : : u32 head)
2867 : : {
2868 : 0 : struct intel_ring *ring = ce->ring;
2869 : 0 : u32 *regs = ce->lrc_reg_state;
2870 : :
2871 : 0 : GEM_BUG_ON(!intel_ring_offset_valid(ring, head));
2872 : 0 : GEM_BUG_ON(!intel_ring_offset_valid(ring, ring->tail));
2873 : :
2874 [ # # ]: 0 : regs[CTX_RING_START] = i915_ggtt_offset(ring->vma);
2875 : 0 : regs[CTX_RING_HEAD] = head;
2876 : 0 : regs[CTX_RING_TAIL] = ring->tail;
2877 : :
2878 : : /* RPCS */
2879 [ # # ]: 0 : if (engine->class == RENDER_CLASS) {
2880 : 0 : regs[CTX_R_PWR_CLK_STATE] =
2881 : 0 : intel_sseu_make_rpcs(engine->i915, &ce->sseu);
2882 : :
2883 : 0 : i915_oa_init_reg_state(ce, engine);
2884 : : }
2885 : 0 : }
2886 : :
2887 : : static int
2888 : 0 : __execlists_context_pin(struct intel_context *ce,
2889 : : struct intel_engine_cs *engine)
2890 : : {
2891 : 0 : void *vaddr;
2892 : :
2893 : 0 : GEM_BUG_ON(!ce->state);
2894 : 0 : GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
2895 : :
2896 : 0 : vaddr = i915_gem_object_pin_map(ce->state->obj,
2897 : 0 : i915_coherent_map_type(engine->i915) |
2898 : : I915_MAP_OVERRIDE);
2899 [ # # ]: 0 : if (IS_ERR(vaddr))
2900 : 0 : return PTR_ERR(vaddr);
2901 : :
2902 : 0 : ce->lrc_desc = lrc_descriptor(ce, engine) | CTX_DESC_FORCE_RESTORE;
2903 : 0 : ce->lrc_reg_state = vaddr + LRC_STATE_PN * PAGE_SIZE;
2904 : 0 : __execlists_update_reg_state(ce, engine, ce->ring->tail);
2905 : :
2906 : 0 : return 0;
2907 : : }
2908 : :
2909 : 0 : static int execlists_context_pin(struct intel_context *ce)
2910 : : {
2911 : 0 : return __execlists_context_pin(ce, ce->engine);
2912 : : }
2913 : :
2914 : 0 : static int execlists_context_alloc(struct intel_context *ce)
2915 : : {
2916 : 0 : return __execlists_context_alloc(ce, ce->engine);
2917 : : }
2918 : :
2919 : 0 : static void execlists_context_reset(struct intel_context *ce)
2920 : : {
2921 : 0 : CE_TRACE(ce, "reset\n");
2922 : 0 : GEM_BUG_ON(!intel_context_is_pinned(ce));
2923 : :
2924 : : /*
2925 : : * Because we emit WA_TAIL_DWORDS there may be a disparity
2926 : : * between our bookkeeping in ce->ring->head and ce->ring->tail and
2927 : : * that stored in context. As we only write new commands from
2928 : : * ce->ring->tail onwards, everything before that is junk. If the GPU
2929 : : * starts reading from its RING_HEAD from the context, it may try to
2930 : : * execute that junk and die.
2931 : : *
2932 : : * The contexts that are stilled pinned on resume belong to the
2933 : : * kernel, and are local to each engine. All other contexts will
2934 : : * have their head/tail sanitized upon pinning before use, so they
2935 : : * will never see garbage,
2936 : : *
2937 : : * So to avoid that we reset the context images upon resume. For
2938 : : * simplicity, we just zero everything out.
2939 : : */
2940 : 0 : intel_ring_reset(ce->ring, ce->ring->emit);
2941 : :
2942 : : /* Scrub away the garbage */
2943 : 0 : execlists_init_reg_state(ce->lrc_reg_state,
2944 : 0 : ce, ce->engine, ce->ring, true);
2945 : 0 : __execlists_update_reg_state(ce, ce->engine, ce->ring->tail);
2946 : :
2947 : 0 : ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
2948 : 0 : }
2949 : :
2950 : : static const struct intel_context_ops execlists_context_ops = {
2951 : : .alloc = execlists_context_alloc,
2952 : :
2953 : : .pin = execlists_context_pin,
2954 : : .unpin = execlists_context_unpin,
2955 : :
2956 : : .enter = intel_context_enter_engine,
2957 : : .exit = intel_context_exit_engine,
2958 : :
2959 : : .reset = execlists_context_reset,
2960 : : .destroy = execlists_context_destroy,
2961 : : };
2962 : :
2963 : 0 : static int gen8_emit_init_breadcrumb(struct i915_request *rq)
2964 : : {
2965 : 0 : u32 *cs;
2966 : :
2967 : 0 : GEM_BUG_ON(!i915_request_timeline(rq)->has_initial_breadcrumb);
2968 : :
2969 : 0 : cs = intel_ring_begin(rq, 6);
2970 [ # # ]: 0 : if (IS_ERR(cs))
2971 : 0 : return PTR_ERR(cs);
2972 : :
2973 : : /*
2974 : : * Check if we have been preempted before we even get started.
2975 : : *
2976 : : * After this point i915_request_started() reports true, even if
2977 : : * we get preempted and so are no longer running.
2978 : : */
2979 : 0 : *cs++ = MI_ARB_CHECK;
2980 : 0 : *cs++ = MI_NOOP;
2981 : :
2982 : 0 : *cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
2983 : 0 : *cs++ = i915_request_timeline(rq)->hwsp_offset;
2984 : 0 : *cs++ = 0;
2985 : 0 : *cs++ = rq->fence.seqno - 1;
2986 : :
2987 : 0 : intel_ring_advance(rq, cs);
2988 : :
2989 : : /* Record the updated position of the request's payload */
2990 : 0 : rq->infix = intel_ring_offset(rq, cs);
2991 : :
2992 : 0 : return 0;
2993 : : }
2994 : :
2995 : 0 : static int execlists_request_alloc(struct i915_request *request)
2996 : : {
2997 : 0 : int ret;
2998 : :
2999 : 0 : GEM_BUG_ON(!intel_context_is_pinned(request->context));
3000 : :
3001 : : /*
3002 : : * Flush enough space to reduce the likelihood of waiting after
3003 : : * we start building the request - in which case we will just
3004 : : * have to repeat work.
3005 : : */
3006 : 0 : request->reserved_space += EXECLISTS_REQUEST_SIZE;
3007 : :
3008 : : /*
3009 : : * Note that after this point, we have committed to using
3010 : : * this request as it is being used to both track the
3011 : : * state of engine initialisation and liveness of the
3012 : : * golden renderstate above. Think twice before you try
3013 : : * to cancel/unwind this request now.
3014 : : */
3015 : :
3016 : : /* Unconditionally invalidate GPU caches and TLBs. */
3017 : 0 : ret = request->engine->emit_flush(request, EMIT_INVALIDATE);
3018 [ # # ]: 0 : if (ret)
3019 : : return ret;
3020 : :
3021 : 0 : request->reserved_space -= EXECLISTS_REQUEST_SIZE;
3022 : 0 : return 0;
3023 : : }
3024 : :
3025 : : /*
3026 : : * In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
3027 : : * PIPE_CONTROL instruction. This is required for the flush to happen correctly
3028 : : * but there is a slight complication as this is applied in WA batch where the
3029 : : * values are only initialized once so we cannot take register value at the
3030 : : * beginning and reuse it further; hence we save its value to memory, upload a
3031 : : * constant value with bit21 set and then we restore it back with the saved value.
3032 : : * To simplify the WA, a constant value is formed by using the default value
3033 : : * of this register. This shouldn't be a problem because we are only modifying
3034 : : * it for a short period and this batch in non-premptible. We can ofcourse
3035 : : * use additional instructions that read the actual value of the register
3036 : : * at that time and set our bit of interest but it makes the WA complicated.
3037 : : *
3038 : : * This WA is also required for Gen9 so extracting as a function avoids
3039 : : * code duplication.
3040 : : */
3041 : : static u32 *
3042 : : gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *engine, u32 *batch)
3043 : : {
3044 : : /* NB no one else is allowed to scribble over scratch + 256! */
3045 : : *batch++ = MI_STORE_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3046 : : *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3047 : : *batch++ = intel_gt_scratch_offset(engine->gt,
3048 : : INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3049 : : *batch++ = 0;
3050 : :
3051 : : *batch++ = MI_LOAD_REGISTER_IMM(1);
3052 : : *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3053 : : *batch++ = 0x40400000 | GEN8_LQSC_FLUSH_COHERENT_LINES;
3054 : :
3055 : : batch = gen8_emit_pipe_control(batch,
3056 : : PIPE_CONTROL_CS_STALL |
3057 : : PIPE_CONTROL_DC_FLUSH_ENABLE,
3058 : : 0);
3059 : :
3060 : : *batch++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_SRM_LRM_GLOBAL_GTT;
3061 : : *batch++ = i915_mmio_reg_offset(GEN8_L3SQCREG4);
3062 : : *batch++ = intel_gt_scratch_offset(engine->gt,
3063 : : INTEL_GT_SCRATCH_FIELD_COHERENTL3_WA);
3064 : : *batch++ = 0;
3065 : :
3066 : : return batch;
3067 : : }
3068 : :
3069 : : /*
3070 : : * Typically we only have one indirect_ctx and per_ctx batch buffer which are
3071 : : * initialized at the beginning and shared across all contexts but this field
3072 : : * helps us to have multiple batches at different offsets and select them based
3073 : : * on a criteria. At the moment this batch always start at the beginning of the page
3074 : : * and at this point we don't have multiple wa_ctx batch buffers.
3075 : : *
3076 : : * The number of WA applied are not known at the beginning; we use this field
3077 : : * to return the no of DWORDS written.
3078 : : *
3079 : : * It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
3080 : : * so it adds NOOPs as padding to make it cacheline aligned.
3081 : : * MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
3082 : : * makes a complete batch buffer.
3083 : : */
3084 : 0 : static u32 *gen8_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3085 : : {
3086 : : /* WaDisableCtxRestoreArbitration:bdw,chv */
3087 : 0 : *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3088 : :
3089 : : /* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
3090 [ # # ]: 0 : if (IS_BROADWELL(engine->i915))
3091 : 0 : batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3092 : :
3093 : : /* WaClearSlmSpaceAtContextSwitch:bdw,chv */
3094 : : /* Actual scratch location is at 128 bytes offset */
3095 : 0 : batch = gen8_emit_pipe_control(batch,
3096 : : PIPE_CONTROL_FLUSH_L3 |
3097 : : PIPE_CONTROL_STORE_DATA_INDEX |
3098 : : PIPE_CONTROL_CS_STALL |
3099 : : PIPE_CONTROL_QW_WRITE,
3100 : : LRC_PPHWSP_SCRATCH_ADDR);
3101 : :
3102 : 0 : *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3103 : :
3104 : : /* Pad to end of cacheline */
3105 [ # # ]: 0 : while ((unsigned long)batch % CACHELINE_BYTES)
3106 : 0 : *batch++ = MI_NOOP;
3107 : :
3108 : : /*
3109 : : * MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
3110 : : * execution depends on the length specified in terms of cache lines
3111 : : * in the register CTX_RCS_INDIRECT_CTX
3112 : : */
3113 : :
3114 : 0 : return batch;
3115 : : }
3116 : :
3117 : : struct lri {
3118 : : i915_reg_t reg;
3119 : : u32 value;
3120 : : };
3121 : :
3122 : 0 : static u32 *emit_lri(u32 *batch, const struct lri *lri, unsigned int count)
3123 : : {
3124 : 0 : GEM_BUG_ON(!count || count > 63);
3125 : :
3126 : 0 : *batch++ = MI_LOAD_REGISTER_IMM(count);
3127 : 0 : do {
3128 [ # # ]: 0 : *batch++ = i915_mmio_reg_offset(lri->reg);
3129 : 0 : *batch++ = lri->value;
3130 [ # # ]: 0 : } while (lri++, --count);
3131 : 0 : *batch++ = MI_NOOP;
3132 : :
3133 : 0 : return batch;
3134 : : }
3135 : :
3136 : 0 : static u32 *gen9_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3137 : : {
3138 : 0 : static const struct lri lri[] = {
3139 : : /* WaDisableGatherAtSetShaderCommonSlice:skl,bxt,kbl,glk */
3140 : : {
3141 : : COMMON_SLICE_CHICKEN2,
3142 : : __MASKED_FIELD(GEN9_DISABLE_GATHER_AT_SET_SHADER_COMMON_SLICE,
3143 : : 0),
3144 : : },
3145 : :
3146 : : /* BSpec: 11391 */
3147 : : {
3148 : : FF_SLICE_CHICKEN,
3149 : : __MASKED_FIELD(FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX,
3150 : : FF_SLICE_CHICKEN_CL_PROVOKING_VERTEX_FIX),
3151 : : },
3152 : :
3153 : : /* BSpec: 11299 */
3154 : : {
3155 : : _3D_CHICKEN3,
3156 : : __MASKED_FIELD(_3D_CHICKEN_SF_PROVOKING_VERTEX_FIX,
3157 : : _3D_CHICKEN_SF_PROVOKING_VERTEX_FIX),
3158 : : }
3159 : : };
3160 : :
3161 : 0 : *batch++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3162 : :
3163 : : /* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt,glk */
3164 : 0 : batch = gen8_emit_flush_coherentl3_wa(engine, batch);
3165 : :
3166 : : /* WaClearSlmSpaceAtContextSwitch:skl,bxt,kbl,glk,cfl */
3167 : 0 : batch = gen8_emit_pipe_control(batch,
3168 : : PIPE_CONTROL_FLUSH_L3 |
3169 : : PIPE_CONTROL_STORE_DATA_INDEX |
3170 : : PIPE_CONTROL_CS_STALL |
3171 : : PIPE_CONTROL_QW_WRITE,
3172 : : LRC_PPHWSP_SCRATCH_ADDR);
3173 : :
3174 : 0 : batch = emit_lri(batch, lri, ARRAY_SIZE(lri));
3175 : :
3176 : : /* WaMediaPoolStateCmdInWABB:bxt,glk */
3177 [ # # ]: 0 : if (HAS_POOLED_EU(engine->i915)) {
3178 : : /*
3179 : : * EU pool configuration is setup along with golden context
3180 : : * during context initialization. This value depends on
3181 : : * device type (2x6 or 3x6) and needs to be updated based
3182 : : * on which subslice is disabled especially for 2x6
3183 : : * devices, however it is safe to load default
3184 : : * configuration of 3x6 device instead of masking off
3185 : : * corresponding bits because HW ignores bits of a disabled
3186 : : * subslice and drops down to appropriate config. Please
3187 : : * see render_state_setup() in i915_gem_render_state.c for
3188 : : * possible configurations, to avoid duplication they are
3189 : : * not shown here again.
3190 : : */
3191 : 0 : *batch++ = GEN9_MEDIA_POOL_STATE;
3192 : 0 : *batch++ = GEN9_MEDIA_POOL_ENABLE;
3193 : 0 : *batch++ = 0x00777000;
3194 : 0 : *batch++ = 0;
3195 : 0 : *batch++ = 0;
3196 : 0 : *batch++ = 0;
3197 : : }
3198 : :
3199 : 0 : *batch++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3200 : :
3201 : : /* Pad to end of cacheline */
3202 [ # # ]: 0 : while ((unsigned long)batch % CACHELINE_BYTES)
3203 : 0 : *batch++ = MI_NOOP;
3204 : :
3205 : 0 : return batch;
3206 : : }
3207 : :
3208 : : static u32 *
3209 : 0 : gen10_init_indirectctx_bb(struct intel_engine_cs *engine, u32 *batch)
3210 : : {
3211 : 0 : int i;
3212 : :
3213 : : /*
3214 : : * WaPipeControlBefore3DStateSamplePattern: cnl
3215 : : *
3216 : : * Ensure the engine is idle prior to programming a
3217 : : * 3DSTATE_SAMPLE_PATTERN during a context restore.
3218 : : */
3219 : 0 : batch = gen8_emit_pipe_control(batch,
3220 : : PIPE_CONTROL_CS_STALL,
3221 : : 0);
3222 : : /*
3223 : : * WaPipeControlBefore3DStateSamplePattern says we need 4 dwords for
3224 : : * the PIPE_CONTROL followed by 12 dwords of 0x0, so 16 dwords in
3225 : : * total. However, a PIPE_CONTROL is 6 dwords long, not 4, which is
3226 : : * confusing. Since gen8_emit_pipe_control() already advances the
3227 : : * batch by 6 dwords, we advance the other 10 here, completing a
3228 : : * cacheline. It's not clear if the workaround requires this padding
3229 : : * before other commands, or if it's just the regular padding we would
3230 : : * already have for the workaround bb, so leave it here for now.
3231 : : */
3232 [ # # ]: 0 : for (i = 0; i < 10; i++)
3233 : 0 : *batch++ = MI_NOOP;
3234 : :
3235 : : /* Pad to end of cacheline */
3236 [ # # ]: 0 : while ((unsigned long)batch % CACHELINE_BYTES)
3237 : 0 : *batch++ = MI_NOOP;
3238 : :
3239 : 0 : return batch;
3240 : : }
3241 : :
3242 : : #define CTX_WA_BB_OBJ_SIZE (PAGE_SIZE)
3243 : :
3244 : 0 : static int lrc_setup_wa_ctx(struct intel_engine_cs *engine)
3245 : : {
3246 : 0 : struct drm_i915_gem_object *obj;
3247 : 0 : struct i915_vma *vma;
3248 : 0 : int err;
3249 : :
3250 : 0 : obj = i915_gem_object_create_shmem(engine->i915, CTX_WA_BB_OBJ_SIZE);
3251 [ # # ]: 0 : if (IS_ERR(obj))
3252 : 0 : return PTR_ERR(obj);
3253 : :
3254 : 0 : vma = i915_vma_instance(obj, &engine->gt->ggtt->vm, NULL);
3255 [ # # ]: 0 : if (IS_ERR(vma)) {
3256 : 0 : err = PTR_ERR(vma);
3257 : 0 : goto err;
3258 : : }
3259 : :
3260 : 0 : err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
3261 [ # # ]: 0 : if (err)
3262 : 0 : goto err;
3263 : :
3264 : 0 : engine->wa_ctx.vma = vma;
3265 : 0 : return 0;
3266 : :
3267 : 0 : err:
3268 : 0 : i915_gem_object_put(obj);
3269 : 0 : return err;
3270 : : }
3271 : :
3272 : 0 : static void lrc_destroy_wa_ctx(struct intel_engine_cs *engine)
3273 : : {
3274 : 0 : i915_vma_unpin_and_release(&engine->wa_ctx.vma, 0);
3275 : : }
3276 : :
3277 : : typedef u32 *(*wa_bb_func_t)(struct intel_engine_cs *engine, u32 *batch);
3278 : :
3279 : 0 : static int intel_init_workaround_bb(struct intel_engine_cs *engine)
3280 : : {
3281 : 0 : struct i915_ctx_workarounds *wa_ctx = &engine->wa_ctx;
3282 : 0 : struct i915_wa_ctx_bb *wa_bb[2] = { &wa_ctx->indirect_ctx,
3283 : 0 : &wa_ctx->per_ctx };
3284 : 0 : wa_bb_func_t wa_bb_fn[2];
3285 : 0 : struct page *page;
3286 : 0 : void *batch, *batch_ptr;
3287 : 0 : unsigned int i;
3288 : 0 : int ret;
3289 : :
3290 [ # # ]: 0 : if (engine->class != RENDER_CLASS)
3291 : : return 0;
3292 : :
3293 [ # # # # : 0 : switch (INTEL_GEN(engine->i915)) {
# ]
3294 : : case 12:
3295 : : case 11:
3296 : : return 0;
3297 : 0 : case 10:
3298 : 0 : wa_bb_fn[0] = gen10_init_indirectctx_bb;
3299 : 0 : wa_bb_fn[1] = NULL;
3300 : 0 : break;
3301 : 0 : case 9:
3302 : 0 : wa_bb_fn[0] = gen9_init_indirectctx_bb;
3303 : 0 : wa_bb_fn[1] = NULL;
3304 : 0 : break;
3305 : 0 : case 8:
3306 : 0 : wa_bb_fn[0] = gen8_init_indirectctx_bb;
3307 : 0 : wa_bb_fn[1] = NULL;
3308 : 0 : break;
3309 : : default:
3310 : 0 : MISSING_CASE(INTEL_GEN(engine->i915));
3311 : 0 : return 0;
3312 : : }
3313 : :
3314 : 0 : ret = lrc_setup_wa_ctx(engine);
3315 [ # # ]: 0 : if (ret) {
3316 : 0 : DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
3317 : 0 : return ret;
3318 : : }
3319 : :
3320 : 0 : page = i915_gem_object_get_dirty_page(wa_ctx->vma->obj, 0);
3321 : 0 : batch = batch_ptr = kmap_atomic(page);
3322 : :
3323 : : /*
3324 : : * Emit the two workaround batch buffers, recording the offset from the
3325 : : * start of the workaround batch buffer object for each and their
3326 : : * respective sizes.
3327 : : */
3328 [ # # ]: 0 : for (i = 0; i < ARRAY_SIZE(wa_bb_fn); i++) {
3329 : 0 : wa_bb[i]->offset = batch_ptr - batch;
3330 : 0 : if (GEM_DEBUG_WARN_ON(!IS_ALIGNED(wa_bb[i]->offset,
3331 : : CACHELINE_BYTES))) {
3332 : : ret = -EINVAL;
3333 : : break;
3334 : : }
3335 [ # # ]: 0 : if (wa_bb_fn[i])
3336 : 0 : batch_ptr = wa_bb_fn[i](engine, batch_ptr);
3337 : 0 : wa_bb[i]->size = batch_ptr - (batch + wa_bb[i]->offset);
3338 : : }
3339 : :
3340 [ # # ]: 0 : BUG_ON(batch_ptr - batch > CTX_WA_BB_OBJ_SIZE);
3341 : :
3342 : 0 : kunmap_atomic(batch);
3343 : 0 : if (ret)
3344 : : lrc_destroy_wa_ctx(engine);
3345 : :
3346 : 0 : return ret;
3347 : : }
3348 : :
3349 : 0 : static void enable_execlists(struct intel_engine_cs *engine)
3350 : : {
3351 : 0 : u32 mode;
3352 : :
3353 : 0 : assert_forcewakes_active(engine->uncore, FORCEWAKE_ALL);
3354 : :
3355 : 0 : intel_engine_set_hwsp_writemask(engine, ~0u); /* HWSTAM */
3356 : :
3357 [ # # ]: 0 : if (INTEL_GEN(engine->i915) >= 11)
3358 : : mode = _MASKED_BIT_ENABLE(GEN11_GFX_DISABLE_LEGACY_MODE);
3359 : : else
3360 : : mode = _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE);
3361 : 0 : ENGINE_WRITE_FW(engine, RING_MODE_GEN7, mode);
3362 : :
3363 : 0 : ENGINE_WRITE_FW(engine, RING_MI_MODE, _MASKED_BIT_DISABLE(STOP_RING));
3364 : :
3365 : 0 : ENGINE_WRITE_FW(engine,
3366 : : RING_HWS_PGA,
3367 : : i915_ggtt_offset(engine->status_page.vma));
3368 : 0 : ENGINE_POSTING_READ(engine, RING_HWS_PGA);
3369 : :
3370 : 0 : engine->context_tag = 0;
3371 : 0 : }
3372 : :
3373 : : static bool unexpected_starting_state(struct intel_engine_cs *engine)
3374 : : {
3375 : : bool unexpected = false;
3376 : :
3377 : : if (ENGINE_READ_FW(engine, RING_MI_MODE) & STOP_RING) {
3378 : : DRM_DEBUG_DRIVER("STOP_RING still set in RING_MI_MODE\n");
3379 : : unexpected = true;
3380 : : }
3381 : :
3382 : : return unexpected;
3383 : : }
3384 : :
3385 : 0 : static int execlists_resume(struct intel_engine_cs *engine)
3386 : : {
3387 : 0 : intel_engine_apply_workarounds(engine);
3388 : 0 : intel_engine_apply_whitelist(engine);
3389 : :
3390 : 0 : intel_mocs_init_engine(engine);
3391 : :
3392 : 0 : intel_engine_reset_breadcrumbs(engine);
3393 : :
3394 : 0 : if (GEM_SHOW_DEBUG() && unexpected_starting_state(engine)) {
3395 : : struct drm_printer p = drm_debug_printer(__func__);
3396 : :
3397 : : intel_engine_dump(engine, &p, NULL);
3398 : : }
3399 : :
3400 : 0 : enable_execlists(engine);
3401 : :
3402 : 0 : return 0;
3403 : : }
3404 : :
3405 : 0 : static void execlists_reset_prepare(struct intel_engine_cs *engine)
3406 : : {
3407 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
3408 : 0 : unsigned long flags;
3409 : :
3410 : 0 : ENGINE_TRACE(engine, "depth<-%d\n",
3411 : : atomic_read(&execlists->tasklet.count));
3412 : :
3413 : : /*
3414 : : * Prevent request submission to the hardware until we have
3415 : : * completed the reset in i915_gem_reset_finish(). If a request
3416 : : * is completed by one engine, it may then queue a request
3417 : : * to a second via its execlists->tasklet *just* as we are
3418 : : * calling engine->resume() and also writing the ELSP.
3419 : : * Turning off the execlists->tasklet until the reset is over
3420 : : * prevents the race.
3421 : : */
3422 : 0 : __tasklet_disable_sync_once(&execlists->tasklet);
3423 : 0 : GEM_BUG_ON(!reset_in_progress(execlists));
3424 : :
3425 : : /* And flush any current direct submission. */
3426 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
3427 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
3428 : :
3429 : : /*
3430 : : * We stop engines, otherwise we might get failed reset and a
3431 : : * dead gpu (on elk). Also as modern gpu as kbl can suffer
3432 : : * from system hang if batchbuffer is progressing when
3433 : : * the reset is issued, regardless of READY_TO_RESET ack.
3434 : : * Thus assume it is best to stop engines on all gens
3435 : : * where we have a gpu reset.
3436 : : *
3437 : : * WaKBLVECSSemaphoreWaitPoll:kbl (on ALL_ENGINES)
3438 : : *
3439 : : * FIXME: Wa for more modern gens needs to be validated
3440 : : */
3441 : 0 : intel_engine_stop_cs(engine);
3442 : 0 : }
3443 : :
3444 : 0 : static void reset_csb_pointers(struct intel_engine_cs *engine)
3445 : : {
3446 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
3447 : 0 : const unsigned int reset_value = execlists->csb_size - 1;
3448 : :
3449 : 0 : ring_set_paused(engine, 0);
3450 : :
3451 : : /*
3452 : : * After a reset, the HW starts writing into CSB entry [0]. We
3453 : : * therefore have to set our HEAD pointer back one entry so that
3454 : : * the *first* entry we check is entry 0. To complicate this further,
3455 : : * as we don't wait for the first interrupt after reset, we have to
3456 : : * fake the HW write to point back to the last entry so that our
3457 : : * inline comparison of our cached head position against the last HW
3458 : : * write works even before the first interrupt.
3459 : : */
3460 : 0 : execlists->csb_head = reset_value;
3461 : 0 : WRITE_ONCE(*execlists->csb_write, reset_value);
3462 : 0 : wmb(); /* Make sure this is visible to HW (paranoia?) */
3463 : :
3464 : : /*
3465 : : * Sometimes Icelake forgets to reset its pointers on a GPU reset.
3466 : : * Bludgeon them with a mmio update to be sure.
3467 : : */
3468 : 0 : ENGINE_WRITE(engine, RING_CONTEXT_STATUS_PTR,
3469 : : reset_value << 8 | reset_value);
3470 : 0 : ENGINE_POSTING_READ(engine, RING_CONTEXT_STATUS_PTR);
3471 : :
3472 : 0 : invalidate_csb_entries(&execlists->csb_status[0],
3473 : 0 : &execlists->csb_status[reset_value]);
3474 : 0 : }
3475 : :
3476 : 0 : static void __reset_stop_ring(u32 *regs, const struct intel_engine_cs *engine)
3477 : : {
3478 : 0 : int x;
3479 : :
3480 : 0 : x = lrc_ring_mi_mode(engine);
3481 : : if (x != -1) {
3482 : 0 : regs[x + 1] &= ~STOP_RING;
3483 : 0 : regs[x + 1] |= STOP_RING << 16;
3484 : : }
3485 : : }
3486 : :
3487 : 0 : static void __execlists_reset_reg_state(const struct intel_context *ce,
3488 : : const struct intel_engine_cs *engine)
3489 : : {
3490 : 0 : u32 *regs = ce->lrc_reg_state;
3491 : :
3492 : 0 : __reset_stop_ring(regs, engine);
3493 : : }
3494 : :
3495 : 0 : static void __execlists_reset(struct intel_engine_cs *engine, bool stalled)
3496 : : {
3497 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
3498 : 0 : struct intel_context *ce;
3499 : 0 : struct i915_request *rq;
3500 : 0 : u32 head;
3501 : :
3502 : 0 : mb(); /* paranoia: read the CSB pointers from after the reset */
3503 : 0 : clflush(execlists->csb_write);
3504 : 0 : mb();
3505 : :
3506 : 0 : process_csb(engine); /* drain preemption events */
3507 : :
3508 : : /* Following the reset, we need to reload the CSB read/write pointers */
3509 : 0 : reset_csb_pointers(engine);
3510 : :
3511 : : /*
3512 : : * Save the currently executing context, even if we completed
3513 : : * its request, it was still running at the time of the
3514 : : * reset and will have been clobbered.
3515 : : */
3516 [ # # ]: 0 : rq = execlists_active(execlists);
3517 [ # # ]: 0 : if (!rq)
3518 : 0 : goto unwind;
3519 : :
3520 : : /* We still have requests in-flight; the engine should be active */
3521 : 0 : GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
3522 : :
3523 : 0 : ce = rq->context;
3524 : 0 : GEM_BUG_ON(!i915_vma_is_pinned(ce->state));
3525 : :
3526 [ # # ]: 0 : if (i915_request_completed(rq)) {
3527 : : /* Idle context; tidy up the ring so we can restart afresh */
3528 : 0 : head = intel_ring_wrap(ce->ring, rq->tail);
3529 : 0 : goto out_replay;
3530 : : }
3531 : :
3532 : : /* Context has requests still in-flight; it should not be idle! */
3533 : 0 : GEM_BUG_ON(i915_active_is_idle(&ce->active));
3534 : 0 : rq = active_request(ce->timeline, rq);
3535 : 0 : head = intel_ring_wrap(ce->ring, rq->head);
3536 : 0 : GEM_BUG_ON(head == ce->ring->tail);
3537 : :
3538 : : /*
3539 : : * If this request hasn't started yet, e.g. it is waiting on a
3540 : : * semaphore, we need to avoid skipping the request or else we
3541 : : * break the signaling chain. However, if the context is corrupt
3542 : : * the request will not restart and we will be stuck with a wedged
3543 : : * device. It is quite often the case that if we issue a reset
3544 : : * while the GPU is loading the context image, that the context
3545 : : * image becomes corrupt.
3546 : : *
3547 : : * Otherwise, if we have not started yet, the request should replay
3548 : : * perfectly and we do not need to flag the result as being erroneous.
3549 : : */
3550 [ # # ]: 0 : if (!i915_request_started(rq))
3551 : 0 : goto out_replay;
3552 : :
3553 : : /*
3554 : : * If the request was innocent, we leave the request in the ELSP
3555 : : * and will try to replay it on restarting. The context image may
3556 : : * have been corrupted by the reset, in which case we may have
3557 : : * to service a new GPU hang, but more likely we can continue on
3558 : : * without impact.
3559 : : *
3560 : : * If the request was guilty, we presume the context is corrupt
3561 : : * and have to at least restore the RING register in the context
3562 : : * image back to the expected values to skip over the guilty request.
3563 : : */
3564 : 0 : __i915_request_reset(rq, stalled);
3565 [ # # ]: 0 : if (!stalled)
3566 : 0 : goto out_replay;
3567 : :
3568 : : /*
3569 : : * We want a simple context + ring to execute the breadcrumb update.
3570 : : * We cannot rely on the context being intact across the GPU hang,
3571 : : * so clear it and rebuild just what we need for the breadcrumb.
3572 : : * All pending requests for this context will be zapped, and any
3573 : : * future request will be after userspace has had the opportunity
3574 : : * to recreate its own state.
3575 : : */
3576 : 0 : GEM_BUG_ON(!intel_context_is_pinned(ce));
3577 : 0 : restore_default_state(ce, engine);
3578 : :
3579 : 0 : out_replay:
3580 : 0 : ENGINE_TRACE(engine, "replay {head:%04x, tail:%04x}\n",
3581 : : head, ce->ring->tail);
3582 [ # # ]: 0 : __execlists_reset_reg_state(ce, engine);
3583 : 0 : __execlists_update_reg_state(ce, engine, head);
3584 : 0 : ce->lrc_desc |= CTX_DESC_FORCE_RESTORE; /* paranoid: GPU was reset! */
3585 : :
3586 : 0 : unwind:
3587 : : /* Push back any incomplete requests for replay after the reset. */
3588 : 0 : cancel_port_requests(execlists);
3589 : 0 : __unwind_incomplete_requests(engine);
3590 : 0 : }
3591 : :
3592 : 0 : static void execlists_reset_rewind(struct intel_engine_cs *engine, bool stalled)
3593 : : {
3594 : 0 : unsigned long flags;
3595 : :
3596 : 0 : ENGINE_TRACE(engine, "\n");
3597 : :
3598 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
3599 : :
3600 : 0 : __execlists_reset(engine, stalled);
3601 : :
3602 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
3603 : 0 : }
3604 : :
3605 : 0 : static void nop_submission_tasklet(unsigned long data)
3606 : : {
3607 : : /* The driver is wedged; don't process any more events. */
3608 : 0 : }
3609 : :
3610 : 0 : static void execlists_reset_cancel(struct intel_engine_cs *engine)
3611 : : {
3612 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
3613 : 0 : struct i915_request *rq, *rn;
3614 : 0 : struct rb_node *rb;
3615 : 0 : unsigned long flags;
3616 : :
3617 : 0 : ENGINE_TRACE(engine, "\n");
3618 : :
3619 : : /*
3620 : : * Before we call engine->cancel_requests(), we should have exclusive
3621 : : * access to the submission state. This is arranged for us by the
3622 : : * caller disabling the interrupt generation, the tasklet and other
3623 : : * threads that may then access the same state, giving us a free hand
3624 : : * to reset state. However, we still need to let lockdep be aware that
3625 : : * we know this state may be accessed in hardirq context, so we
3626 : : * disable the irq around this manipulation and we want to keep
3627 : : * the spinlock focused on its duties and not accidentally conflate
3628 : : * coverage to the submission's irq state. (Similarly, although we
3629 : : * shouldn't need to disable irq around the manipulation of the
3630 : : * submission's irq state, we also wish to remind ourselves that
3631 : : * it is irq state.)
3632 : : */
3633 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
3634 : :
3635 : 0 : __execlists_reset(engine, true);
3636 : :
3637 : : /* Mark all executing requests as skipped. */
3638 [ # # ]: 0 : list_for_each_entry(rq, &engine->active.requests, sched.link)
3639 : 0 : mark_eio(rq);
3640 : :
3641 : : /* Flush the queued requests to the timeline list (for retiring). */
3642 [ # # ]: 0 : while ((rb = rb_first_cached(&execlists->queue))) {
3643 : 0 : struct i915_priolist *p = to_priolist(rb);
3644 : 0 : int i;
3645 : :
3646 [ # # # # ]: 0 : priolist_for_each_request_consume(rq, rn, p, i) {
3647 : 0 : mark_eio(rq);
3648 : 0 : __i915_request_submit(rq);
3649 : : }
3650 : :
3651 : 0 : rb_erase_cached(&p->node, &execlists->queue);
3652 [ # # ]: 0 : i915_priolist_free(p);
3653 : : }
3654 : :
3655 : : /* On-hold requests will be flushed to timeline upon their release */
3656 [ # # ]: 0 : list_for_each_entry(rq, &engine->active.hold, sched.link)
3657 : 0 : mark_eio(rq);
3658 : :
3659 : : /* Cancel all attached virtual engines */
3660 [ # # ]: 0 : while ((rb = rb_first_cached(&execlists->virtual))) {
3661 : 0 : struct virtual_engine *ve =
3662 : 0 : rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
3663 : :
3664 : 0 : rb_erase_cached(rb, &execlists->virtual);
3665 : 0 : RB_CLEAR_NODE(rb);
3666 : :
3667 : 0 : spin_lock(&ve->base.active.lock);
3668 : 0 : rq = fetch_and_zero(&ve->request);
3669 [ # # ]: 0 : if (rq) {
3670 : 0 : mark_eio(rq);
3671 : :
3672 : 0 : rq->engine = engine;
3673 : 0 : __i915_request_submit(rq);
3674 : 0 : i915_request_put(rq);
3675 : :
3676 : 0 : ve->base.execlists.queue_priority_hint = INT_MIN;
3677 : : }
3678 : 0 : spin_unlock(&ve->base.active.lock);
3679 : : }
3680 : :
3681 : : /* Remaining _unready_ requests will be nop'ed when submitted */
3682 : :
3683 : 0 : execlists->queue_priority_hint = INT_MIN;
3684 : 0 : execlists->queue = RB_ROOT_CACHED;
3685 : :
3686 : 0 : GEM_BUG_ON(__tasklet_is_enabled(&execlists->tasklet));
3687 : 0 : execlists->tasklet.func = nop_submission_tasklet;
3688 : :
3689 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
3690 : 0 : }
3691 : :
3692 : 0 : static void execlists_reset_finish(struct intel_engine_cs *engine)
3693 : : {
3694 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
3695 : :
3696 : : /*
3697 : : * After a GPU reset, we may have requests to replay. Do so now while
3698 : : * we still have the forcewake to be sure that the GPU is not allowed
3699 : : * to sleep before we restart and reload a context.
3700 : : */
3701 : 0 : GEM_BUG_ON(!reset_in_progress(execlists));
3702 [ # # ]: 0 : if (!RB_EMPTY_ROOT(&execlists->queue.rb_root))
3703 : 0 : execlists->tasklet.func(execlists->tasklet.data);
3704 : :
3705 [ # # ]: 0 : if (__tasklet_enable(&execlists->tasklet))
3706 : : /* And kick in case we missed a new request submission. */
3707 : 0 : tasklet_hi_schedule(&execlists->tasklet);
3708 : 0 : ENGINE_TRACE(engine, "depth->%d\n",
3709 : : atomic_read(&execlists->tasklet.count));
3710 : 0 : }
3711 : :
3712 : 0 : static int gen8_emit_bb_start_noarb(struct i915_request *rq,
3713 : : u64 offset, u32 len,
3714 : : const unsigned int flags)
3715 : : {
3716 : 0 : u32 *cs;
3717 : :
3718 : 0 : cs = intel_ring_begin(rq, 4);
3719 [ # # ]: 0 : if (IS_ERR(cs))
3720 : 0 : return PTR_ERR(cs);
3721 : :
3722 : : /*
3723 : : * WaDisableCtxRestoreArbitration:bdw,chv
3724 : : *
3725 : : * We don't need to perform MI_ARB_ENABLE as often as we do (in
3726 : : * particular all the gen that do not need the w/a at all!), if we
3727 : : * took care to make sure that on every switch into this context
3728 : : * (both ordinary and for preemption) that arbitrartion was enabled
3729 : : * we would be fine. However, for gen8 there is another w/a that
3730 : : * requires us to not preempt inside GPGPU execution, so we keep
3731 : : * arbitration disabled for gen8 batches. Arbitration will be
3732 : : * re-enabled before we close the request
3733 : : * (engine->emit_fini_breadcrumb).
3734 : : */
3735 : 0 : *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3736 : :
3737 : : /* FIXME(BDW+): Address space and security selectors. */
3738 [ # # ]: 0 : *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3739 : : (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3740 : 0 : *cs++ = lower_32_bits(offset);
3741 : 0 : *cs++ = upper_32_bits(offset);
3742 : :
3743 : 0 : intel_ring_advance(rq, cs);
3744 : :
3745 : 0 : return 0;
3746 : : }
3747 : :
3748 : 0 : static int gen8_emit_bb_start(struct i915_request *rq,
3749 : : u64 offset, u32 len,
3750 : : const unsigned int flags)
3751 : : {
3752 : 0 : u32 *cs;
3753 : :
3754 : 0 : cs = intel_ring_begin(rq, 6);
3755 [ # # ]: 0 : if (IS_ERR(cs))
3756 : 0 : return PTR_ERR(cs);
3757 : :
3758 : 0 : *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
3759 : :
3760 [ # # ]: 0 : *cs++ = MI_BATCH_BUFFER_START_GEN8 |
3761 : : (flags & I915_DISPATCH_SECURE ? 0 : BIT(8));
3762 : 0 : *cs++ = lower_32_bits(offset);
3763 : 0 : *cs++ = upper_32_bits(offset);
3764 : :
3765 : 0 : *cs++ = MI_ARB_ON_OFF | MI_ARB_DISABLE;
3766 : 0 : *cs++ = MI_NOOP;
3767 : :
3768 : 0 : intel_ring_advance(rq, cs);
3769 : :
3770 : 0 : return 0;
3771 : : }
3772 : :
3773 : 0 : static void gen8_logical_ring_enable_irq(struct intel_engine_cs *engine)
3774 : : {
3775 : 0 : ENGINE_WRITE(engine, RING_IMR,
3776 : : ~(engine->irq_enable_mask | engine->irq_keep_mask));
3777 : 0 : ENGINE_POSTING_READ(engine, RING_IMR);
3778 : 0 : }
3779 : :
3780 : 0 : static void gen8_logical_ring_disable_irq(struct intel_engine_cs *engine)
3781 : : {
3782 : 0 : ENGINE_WRITE(engine, RING_IMR, ~engine->irq_keep_mask);
3783 : 0 : }
3784 : :
3785 : 0 : static int gen8_emit_flush(struct i915_request *request, u32 mode)
3786 : : {
3787 : 0 : u32 cmd, *cs;
3788 : :
3789 : 0 : cs = intel_ring_begin(request, 4);
3790 [ # # ]: 0 : if (IS_ERR(cs))
3791 : 0 : return PTR_ERR(cs);
3792 : :
3793 : 0 : cmd = MI_FLUSH_DW + 1;
3794 : :
3795 : : /* We always require a command barrier so that subsequent
3796 : : * commands, such as breadcrumb interrupts, are strictly ordered
3797 : : * wrt the contents of the write cache being flushed to memory
3798 : : * (and thus being coherent from the CPU).
3799 : : */
3800 : 0 : cmd |= MI_FLUSH_DW_STORE_INDEX | MI_FLUSH_DW_OP_STOREDW;
3801 : :
3802 [ # # ]: 0 : if (mode & EMIT_INVALIDATE) {
3803 : 0 : cmd |= MI_INVALIDATE_TLB;
3804 [ # # ]: 0 : if (request->engine->class == VIDEO_DECODE_CLASS)
3805 : 0 : cmd |= MI_INVALIDATE_BSD;
3806 : : }
3807 : :
3808 : 0 : *cs++ = cmd;
3809 : 0 : *cs++ = LRC_PPHWSP_SCRATCH_ADDR;
3810 : 0 : *cs++ = 0; /* upper addr */
3811 : 0 : *cs++ = 0; /* value */
3812 : 0 : intel_ring_advance(request, cs);
3813 : :
3814 : 0 : return 0;
3815 : : }
3816 : :
3817 : 0 : static int gen8_emit_flush_render(struct i915_request *request,
3818 : : u32 mode)
3819 : : {
3820 : 0 : bool vf_flush_wa = false, dc_flush_wa = false;
3821 : 0 : u32 *cs, flags = 0;
3822 : 0 : int len;
3823 : :
3824 : 0 : flags |= PIPE_CONTROL_CS_STALL;
3825 : :
3826 [ # # ]: 0 : if (mode & EMIT_FLUSH) {
3827 : 0 : flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3828 : 0 : flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3829 : 0 : flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3830 : 0 : flags |= PIPE_CONTROL_FLUSH_ENABLE;
3831 : : }
3832 : :
3833 [ # # ]: 0 : if (mode & EMIT_INVALIDATE) {
3834 : 0 : flags |= PIPE_CONTROL_TLB_INVALIDATE;
3835 : 0 : flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3836 : 0 : flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3837 : 0 : flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3838 : 0 : flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3839 : 0 : flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3840 : 0 : flags |= PIPE_CONTROL_QW_WRITE;
3841 : 0 : flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3842 : :
3843 : : /*
3844 : : * On GEN9: before VF_CACHE_INVALIDATE we need to emit a NULL
3845 : : * pipe control.
3846 : : */
3847 [ # # ]: 0 : if (IS_GEN(request->i915, 9))
3848 : 0 : vf_flush_wa = true;
3849 : :
3850 : : /* WaForGAMHang:kbl */
3851 [ # # # # ]: 0 : if (IS_KBL_REVID(request->i915, 0, KBL_REVID_B0))
3852 : 0 : dc_flush_wa = true;
3853 : : }
3854 : :
3855 : 0 : len = 6;
3856 : :
3857 [ # # ]: 0 : if (vf_flush_wa)
3858 : 0 : len += 6;
3859 : :
3860 [ # # ]: 0 : if (dc_flush_wa)
3861 : 0 : len += 12;
3862 : :
3863 : 0 : cs = intel_ring_begin(request, len);
3864 [ # # ]: 0 : if (IS_ERR(cs))
3865 : 0 : return PTR_ERR(cs);
3866 : :
3867 [ # # ]: 0 : if (vf_flush_wa)
3868 : 0 : cs = gen8_emit_pipe_control(cs, 0, 0);
3869 : :
3870 [ # # ]: 0 : if (dc_flush_wa)
3871 : 0 : cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_DC_FLUSH_ENABLE,
3872 : : 0);
3873 : :
3874 [ # # ]: 0 : cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3875 : :
3876 [ # # ]: 0 : if (dc_flush_wa)
3877 : 0 : cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_CS_STALL, 0);
3878 : :
3879 : : intel_ring_advance(request, cs);
3880 : :
3881 : : return 0;
3882 : : }
3883 : :
3884 : 0 : static int gen11_emit_flush_render(struct i915_request *request,
3885 : : u32 mode)
3886 : : {
3887 [ # # ]: 0 : if (mode & EMIT_FLUSH) {
3888 : 0 : u32 *cs;
3889 : 0 : u32 flags = 0;
3890 : :
3891 : 0 : flags |= PIPE_CONTROL_CS_STALL;
3892 : :
3893 : 0 : flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3894 : 0 : flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3895 : 0 : flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3896 : 0 : flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3897 : 0 : flags |= PIPE_CONTROL_FLUSH_ENABLE;
3898 : 0 : flags |= PIPE_CONTROL_QW_WRITE;
3899 : 0 : flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3900 : :
3901 : 0 : cs = intel_ring_begin(request, 6);
3902 [ # # ]: 0 : if (IS_ERR(cs))
3903 : 0 : return PTR_ERR(cs);
3904 : :
3905 : 0 : cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3906 : 0 : intel_ring_advance(request, cs);
3907 : : }
3908 : :
3909 [ # # ]: 0 : if (mode & EMIT_INVALIDATE) {
3910 : 0 : u32 *cs;
3911 : 0 : u32 flags = 0;
3912 : :
3913 : 0 : flags |= PIPE_CONTROL_CS_STALL;
3914 : :
3915 : 0 : flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3916 : 0 : flags |= PIPE_CONTROL_TLB_INVALIDATE;
3917 : 0 : flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3918 : 0 : flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3919 : 0 : flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3920 : 0 : flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3921 : 0 : flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3922 : 0 : flags |= PIPE_CONTROL_QW_WRITE;
3923 : 0 : flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3924 : :
3925 : 0 : cs = intel_ring_begin(request, 6);
3926 [ # # ]: 0 : if (IS_ERR(cs))
3927 : 0 : return PTR_ERR(cs);
3928 : :
3929 : 0 : cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3930 : 0 : intel_ring_advance(request, cs);
3931 : : }
3932 : :
3933 : : return 0;
3934 : : }
3935 : :
3936 : 0 : static u32 preparser_disable(bool state)
3937 : : {
3938 : 0 : return MI_ARB_CHECK | 1 << 8 | state;
3939 : : }
3940 : :
3941 : 0 : static int gen12_emit_flush_render(struct i915_request *request,
3942 : : u32 mode)
3943 : : {
3944 [ # # ]: 0 : if (mode & EMIT_FLUSH) {
3945 : 0 : u32 flags = 0;
3946 : 0 : u32 *cs;
3947 : :
3948 : 0 : flags |= PIPE_CONTROL_TILE_CACHE_FLUSH;
3949 : 0 : flags |= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
3950 : 0 : flags |= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
3951 : : /* Wa_1409600907:tgl */
3952 : 0 : flags |= PIPE_CONTROL_DEPTH_STALL;
3953 : 0 : flags |= PIPE_CONTROL_DC_FLUSH_ENABLE;
3954 : 0 : flags |= PIPE_CONTROL_FLUSH_ENABLE;
3955 : 0 : flags |= PIPE_CONTROL_HDC_PIPELINE_FLUSH;
3956 : :
3957 : 0 : flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3958 : 0 : flags |= PIPE_CONTROL_QW_WRITE;
3959 : :
3960 : 0 : flags |= PIPE_CONTROL_CS_STALL;
3961 : :
3962 : 0 : cs = intel_ring_begin(request, 6);
3963 [ # # ]: 0 : if (IS_ERR(cs))
3964 : 0 : return PTR_ERR(cs);
3965 : :
3966 : 0 : cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
3967 : 0 : intel_ring_advance(request, cs);
3968 : : }
3969 : :
3970 [ # # ]: 0 : if (mode & EMIT_INVALIDATE) {
3971 : 0 : u32 flags = 0;
3972 : 0 : u32 *cs;
3973 : :
3974 : 0 : flags |= PIPE_CONTROL_COMMAND_CACHE_INVALIDATE;
3975 : 0 : flags |= PIPE_CONTROL_TLB_INVALIDATE;
3976 : 0 : flags |= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
3977 : 0 : flags |= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
3978 : 0 : flags |= PIPE_CONTROL_VF_CACHE_INVALIDATE;
3979 : 0 : flags |= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
3980 : 0 : flags |= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
3981 : 0 : flags |= PIPE_CONTROL_L3_RO_CACHE_INVALIDATE;
3982 : :
3983 : 0 : flags |= PIPE_CONTROL_STORE_DATA_INDEX;
3984 : 0 : flags |= PIPE_CONTROL_QW_WRITE;
3985 : :
3986 : 0 : flags |= PIPE_CONTROL_CS_STALL;
3987 : :
3988 : 0 : cs = intel_ring_begin(request, 8);
3989 [ # # ]: 0 : if (IS_ERR(cs))
3990 : 0 : return PTR_ERR(cs);
3991 : :
3992 : : /*
3993 : : * Prevent the pre-parser from skipping past the TLB
3994 : : * invalidate and loading a stale page for the batch
3995 : : * buffer / request payload.
3996 : : */
3997 : 0 : *cs++ = preparser_disable(true);
3998 : :
3999 : 0 : cs = gen8_emit_pipe_control(cs, flags, LRC_PPHWSP_SCRATCH_ADDR);
4000 : :
4001 : 0 : *cs++ = preparser_disable(false);
4002 : 0 : intel_ring_advance(request, cs);
4003 : : }
4004 : :
4005 : : return 0;
4006 : : }
4007 : :
4008 : : /*
4009 : : * Reserve space for 2 NOOPs at the end of each request to be
4010 : : * used as a workaround for not being allowed to do lite
4011 : : * restore with HEAD==TAIL (WaIdleLiteRestore).
4012 : : */
4013 : 0 : static u32 *gen8_emit_wa_tail(struct i915_request *request, u32 *cs)
4014 : : {
4015 : : /* Ensure there's always at least one preemption point per-request. */
4016 : 0 : *cs++ = MI_ARB_CHECK;
4017 : 0 : *cs++ = MI_NOOP;
4018 : 0 : request->wa_tail = intel_ring_offset(request, cs);
4019 : :
4020 : 0 : return cs;
4021 : : }
4022 : :
4023 : 0 : static u32 *emit_preempt_busywait(struct i915_request *request, u32 *cs)
4024 : : {
4025 : 0 : *cs++ = MI_SEMAPHORE_WAIT |
4026 : : MI_SEMAPHORE_GLOBAL_GTT |
4027 : : MI_SEMAPHORE_POLL |
4028 : : MI_SEMAPHORE_SAD_EQ_SDD;
4029 : 0 : *cs++ = 0;
4030 : 0 : *cs++ = intel_hws_preempt_address(request->engine);
4031 : 0 : *cs++ = 0;
4032 : :
4033 : 0 : return cs;
4034 : : }
4035 : :
4036 : : static __always_inline u32*
4037 : 0 : gen8_emit_fini_breadcrumb_footer(struct i915_request *request,
4038 : : u32 *cs)
4039 : : {
4040 : 0 : *cs++ = MI_USER_INTERRUPT;
4041 : :
4042 : 0 : *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4043 : 0 : if (intel_engine_has_semaphores(request->engine))
4044 : 0 : cs = emit_preempt_busywait(request, cs);
4045 : :
4046 : 0 : request->tail = intel_ring_offset(request, cs);
4047 : 0 : assert_ring_tail_valid(request->ring, request->tail);
4048 : :
4049 : 0 : return gen8_emit_wa_tail(request, cs);
4050 : : }
4051 : :
4052 : 0 : static u32 *gen8_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4053 : : {
4054 : 0 : cs = gen8_emit_ggtt_write(cs,
4055 [ # # ]: 0 : request->fence.seqno,
4056 [ # # ]: 0 : i915_request_active_timeline(request)->hwsp_offset,
4057 : : 0);
4058 : :
4059 [ # # ]: 0 : return gen8_emit_fini_breadcrumb_footer(request, cs);
4060 : : }
4061 : :
4062 : 0 : static u32 *gen8_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4063 : : {
4064 [ # # ]: 0 : cs = gen8_emit_pipe_control(cs,
4065 : : PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4066 : : PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4067 : : PIPE_CONTROL_DC_FLUSH_ENABLE,
4068 : : 0);
4069 : :
4070 : : /* XXX flush+write+CS_STALL all in one upsets gem_concurrent_blt:kbl */
4071 : 0 : cs = gen8_emit_ggtt_write_rcs(cs,
4072 [ # # ]: 0 : request->fence.seqno,
4073 [ # # ]: 0 : i915_request_active_timeline(request)->hwsp_offset,
4074 : : PIPE_CONTROL_FLUSH_ENABLE |
4075 : : PIPE_CONTROL_CS_STALL);
4076 : :
4077 [ # # ]: 0 : return gen8_emit_fini_breadcrumb_footer(request, cs);
4078 : : }
4079 : :
4080 : : static u32 *
4081 : 0 : gen11_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4082 : : {
4083 : 0 : cs = gen8_emit_ggtt_write_rcs(cs,
4084 [ # # ]: 0 : request->fence.seqno,
4085 [ # # ]: 0 : i915_request_active_timeline(request)->hwsp_offset,
4086 : : PIPE_CONTROL_CS_STALL |
4087 : : PIPE_CONTROL_TILE_CACHE_FLUSH |
4088 : : PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4089 : : PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4090 : : PIPE_CONTROL_DC_FLUSH_ENABLE |
4091 : : PIPE_CONTROL_FLUSH_ENABLE);
4092 : :
4093 [ # # ]: 0 : return gen8_emit_fini_breadcrumb_footer(request, cs);
4094 : : }
4095 : :
4096 : : /*
4097 : : * Note that the CS instruction pre-parser will not stall on the breadcrumb
4098 : : * flush and will continue pre-fetching the instructions after it before the
4099 : : * memory sync is completed. On pre-gen12 HW, the pre-parser will stop at
4100 : : * BB_START/END instructions, so, even though we might pre-fetch the pre-amble
4101 : : * of the next request before the memory has been flushed, we're guaranteed that
4102 : : * we won't access the batch itself too early.
4103 : : * However, on gen12+ the parser can pre-fetch across the BB_START/END commands,
4104 : : * so, if the current request is modifying an instruction in the next request on
4105 : : * the same intel_context, we might pre-fetch and then execute the pre-update
4106 : : * instruction. To avoid this, the users of self-modifying code should either
4107 : : * disable the parser around the code emitting the memory writes, via a new flag
4108 : : * added to MI_ARB_CHECK, or emit the writes from a different intel_context. For
4109 : : * the in-kernel use-cases we've opted to use a separate context, see
4110 : : * reloc_gpu() as an example.
4111 : : * All the above applies only to the instructions themselves. Non-inline data
4112 : : * used by the instructions is not pre-fetched.
4113 : : */
4114 : :
4115 : 0 : static u32 *gen12_emit_preempt_busywait(struct i915_request *request, u32 *cs)
4116 : : {
4117 : 0 : *cs++ = MI_SEMAPHORE_WAIT_TOKEN |
4118 : : MI_SEMAPHORE_GLOBAL_GTT |
4119 : : MI_SEMAPHORE_POLL |
4120 : : MI_SEMAPHORE_SAD_EQ_SDD;
4121 : 0 : *cs++ = 0;
4122 : 0 : *cs++ = intel_hws_preempt_address(request->engine);
4123 : 0 : *cs++ = 0;
4124 : 0 : *cs++ = 0;
4125 : 0 : *cs++ = MI_NOOP;
4126 : :
4127 : 0 : return cs;
4128 : : }
4129 : :
4130 : : static __always_inline u32*
4131 : 0 : gen12_emit_fini_breadcrumb_footer(struct i915_request *request, u32 *cs)
4132 : : {
4133 : 0 : *cs++ = MI_USER_INTERRUPT;
4134 : :
4135 : 0 : *cs++ = MI_ARB_ON_OFF | MI_ARB_ENABLE;
4136 : 0 : if (intel_engine_has_semaphores(request->engine))
4137 : 0 : cs = gen12_emit_preempt_busywait(request, cs);
4138 : :
4139 : 0 : request->tail = intel_ring_offset(request, cs);
4140 : 0 : assert_ring_tail_valid(request->ring, request->tail);
4141 : :
4142 : 0 : return gen8_emit_wa_tail(request, cs);
4143 : : }
4144 : :
4145 : 0 : static u32 *gen12_emit_fini_breadcrumb(struct i915_request *request, u32 *cs)
4146 : : {
4147 : 0 : cs = gen8_emit_ggtt_write(cs,
4148 [ # # ]: 0 : request->fence.seqno,
4149 [ # # ]: 0 : i915_request_active_timeline(request)->hwsp_offset,
4150 : : 0);
4151 : :
4152 [ # # ]: 0 : return gen12_emit_fini_breadcrumb_footer(request, cs);
4153 : : }
4154 : :
4155 : : static u32 *
4156 : 0 : gen12_emit_fini_breadcrumb_rcs(struct i915_request *request, u32 *cs)
4157 : : {
4158 : 0 : cs = gen8_emit_ggtt_write_rcs(cs,
4159 [ # # ]: 0 : request->fence.seqno,
4160 [ # # ]: 0 : i915_request_active_timeline(request)->hwsp_offset,
4161 : : PIPE_CONTROL_CS_STALL |
4162 : : PIPE_CONTROL_TILE_CACHE_FLUSH |
4163 : : PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
4164 : : PIPE_CONTROL_DEPTH_CACHE_FLUSH |
4165 : : /* Wa_1409600907:tgl */
4166 : : PIPE_CONTROL_DEPTH_STALL |
4167 : : PIPE_CONTROL_DC_FLUSH_ENABLE |
4168 : : PIPE_CONTROL_FLUSH_ENABLE |
4169 : : PIPE_CONTROL_HDC_PIPELINE_FLUSH);
4170 : :
4171 [ # # ]: 0 : return gen12_emit_fini_breadcrumb_footer(request, cs);
4172 : : }
4173 : :
4174 : 0 : static void execlists_park(struct intel_engine_cs *engine)
4175 : : {
4176 : 0 : cancel_timer(&engine->execlists.timer);
4177 : 0 : cancel_timer(&engine->execlists.preempt);
4178 : 0 : }
4179 : :
4180 : 0 : void intel_execlists_set_default_submission(struct intel_engine_cs *engine)
4181 : : {
4182 : 0 : engine->submit_request = execlists_submit_request;
4183 : 0 : engine->schedule = i915_schedule;
4184 : 0 : engine->execlists.tasklet.func = execlists_submission_tasklet;
4185 : :
4186 : 0 : engine->reset.prepare = execlists_reset_prepare;
4187 : 0 : engine->reset.rewind = execlists_reset_rewind;
4188 : 0 : engine->reset.cancel = execlists_reset_cancel;
4189 : 0 : engine->reset.finish = execlists_reset_finish;
4190 : :
4191 : 0 : engine->park = execlists_park;
4192 : 0 : engine->unpark = NULL;
4193 : :
4194 : 0 : engine->flags |= I915_ENGINE_SUPPORTS_STATS;
4195 [ # # ]: 0 : if (!intel_vgpu_active(engine->i915)) {
4196 : 0 : engine->flags |= I915_ENGINE_HAS_SEMAPHORES;
4197 [ # # ]: 0 : if (HAS_LOGICAL_RING_PREEMPTION(engine->i915))
4198 : 0 : engine->flags |= I915_ENGINE_HAS_PREEMPTION;
4199 : : }
4200 : :
4201 [ # # ]: 0 : if (INTEL_GEN(engine->i915) >= 12)
4202 : 0 : engine->flags |= I915_ENGINE_HAS_RELATIVE_MMIO;
4203 : :
4204 [ # # ]: 0 : if (intel_engine_has_preemption(engine))
4205 : 0 : engine->emit_bb_start = gen8_emit_bb_start;
4206 : : else
4207 : 0 : engine->emit_bb_start = gen8_emit_bb_start_noarb;
4208 : 0 : }
4209 : :
4210 : 0 : static void execlists_shutdown(struct intel_engine_cs *engine)
4211 : : {
4212 : : /* Synchronise with residual timers and any softirq they raise */
4213 : 0 : del_timer_sync(&engine->execlists.timer);
4214 : 0 : del_timer_sync(&engine->execlists.preempt);
4215 : 0 : tasklet_kill(&engine->execlists.tasklet);
4216 : 0 : }
4217 : :
4218 : 0 : static void execlists_release(struct intel_engine_cs *engine)
4219 : : {
4220 : 0 : execlists_shutdown(engine);
4221 : :
4222 : 0 : intel_engine_cleanup_common(engine);
4223 : 0 : lrc_destroy_wa_ctx(engine);
4224 : 0 : }
4225 : :
4226 : : static void
4227 : 0 : logical_ring_default_vfuncs(struct intel_engine_cs *engine)
4228 : : {
4229 : : /* Default vfuncs which can be overriden by each engine. */
4230 : :
4231 : 0 : engine->resume = execlists_resume;
4232 : :
4233 : 0 : engine->cops = &execlists_context_ops;
4234 : 0 : engine->request_alloc = execlists_request_alloc;
4235 : :
4236 : 0 : engine->emit_flush = gen8_emit_flush;
4237 : 0 : engine->emit_init_breadcrumb = gen8_emit_init_breadcrumb;
4238 : 0 : engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb;
4239 : 0 : if (INTEL_GEN(engine->i915) >= 12)
4240 : 0 : engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb;
4241 : :
4242 : 0 : engine->set_default_submission = intel_execlists_set_default_submission;
4243 : :
4244 [ # # ]: 0 : if (INTEL_GEN(engine->i915) < 11) {
4245 : 0 : engine->irq_enable = gen8_logical_ring_enable_irq;
4246 : 0 : engine->irq_disable = gen8_logical_ring_disable_irq;
4247 : : } else {
4248 : : /*
4249 : : * TODO: On Gen11 interrupt masks need to be clear
4250 : : * to allow C6 entry. Keep interrupts enabled at
4251 : : * and take the hit of generating extra interrupts
4252 : : * until a more refined solution exists.
4253 : : */
4254 : 0 : }
4255 : : }
4256 : :
4257 : : static inline void
4258 : 0 : logical_ring_default_irqs(struct intel_engine_cs *engine)
4259 : : {
4260 : 0 : unsigned int shift = 0;
4261 : :
4262 : 0 : if (INTEL_GEN(engine->i915) < 11) {
4263 : 0 : const u8 irq_shifts[] = {
4264 : : [RCS0] = GEN8_RCS_IRQ_SHIFT,
4265 : : [BCS0] = GEN8_BCS_IRQ_SHIFT,
4266 : : [VCS0] = GEN8_VCS0_IRQ_SHIFT,
4267 : : [VCS1] = GEN8_VCS1_IRQ_SHIFT,
4268 : : [VECS0] = GEN8_VECS_IRQ_SHIFT,
4269 : : };
4270 : :
4271 : 0 : shift = irq_shifts[engine->id];
4272 : : }
4273 : :
4274 : 0 : engine->irq_enable_mask = GT_RENDER_USER_INTERRUPT << shift;
4275 : 0 : engine->irq_keep_mask = GT_CONTEXT_SWITCH_INTERRUPT << shift;
4276 : : }
4277 : :
4278 : 0 : static void rcs_submission_override(struct intel_engine_cs *engine)
4279 : : {
4280 : 0 : switch (INTEL_GEN(engine->i915)) {
4281 : 0 : case 12:
4282 : 0 : engine->emit_flush = gen12_emit_flush_render;
4283 : 0 : engine->emit_fini_breadcrumb = gen12_emit_fini_breadcrumb_rcs;
4284 : 0 : break;
4285 : 0 : case 11:
4286 : 0 : engine->emit_flush = gen11_emit_flush_render;
4287 : 0 : engine->emit_fini_breadcrumb = gen11_emit_fini_breadcrumb_rcs;
4288 : 0 : break;
4289 : 0 : default:
4290 : 0 : engine->emit_flush = gen8_emit_flush_render;
4291 : 0 : engine->emit_fini_breadcrumb = gen8_emit_fini_breadcrumb_rcs;
4292 : 0 : break;
4293 : : }
4294 : : }
4295 : :
4296 : 0 : int intel_execlists_submission_setup(struct intel_engine_cs *engine)
4297 : : {
4298 : 0 : struct intel_engine_execlists * const execlists = &engine->execlists;
4299 : 0 : struct drm_i915_private *i915 = engine->i915;
4300 : 0 : struct intel_uncore *uncore = engine->uncore;
4301 : 0 : u32 base = engine->mmio_base;
4302 : :
4303 : 0 : tasklet_init(&engine->execlists.tasklet,
4304 : : execlists_submission_tasklet, (unsigned long)engine);
4305 : 0 : timer_setup(&engine->execlists.timer, execlists_timeslice, 0);
4306 : 0 : timer_setup(&engine->execlists.preempt, execlists_preempt, 0);
4307 : :
4308 [ # # ]: 0 : logical_ring_default_vfuncs(engine);
4309 [ # # ]: 0 : logical_ring_default_irqs(engine);
4310 : :
4311 [ # # ]: 0 : if (engine->class == RENDER_CLASS)
4312 [ # # # ]: 0 : rcs_submission_override(engine);
4313 : :
4314 [ # # ]: 0 : if (intel_init_workaround_bb(engine))
4315 : : /*
4316 : : * We continue even if we fail to initialize WA batch
4317 : : * because we only expect rare glitches but nothing
4318 : : * critical to prevent us from using GPU
4319 : : */
4320 : 0 : DRM_ERROR("WA batch buffer initialization failed\n");
4321 : :
4322 [ # # ]: 0 : if (HAS_LOGICAL_RING_ELSQ(i915)) {
4323 : 0 : execlists->submit_reg = uncore->regs +
4324 : 0 : i915_mmio_reg_offset(RING_EXECLIST_SQ_CONTENTS(base));
4325 : 0 : execlists->ctrl_reg = uncore->regs +
4326 : 0 : i915_mmio_reg_offset(RING_EXECLIST_CONTROL(base));
4327 : : } else {
4328 : 0 : execlists->submit_reg = uncore->regs +
4329 : 0 : i915_mmio_reg_offset(RING_ELSP(base));
4330 : : }
4331 : :
4332 : 0 : execlists->csb_status =
4333 : 0 : &engine->status_page.addr[I915_HWS_CSB_BUF0_INDEX];
4334 : :
4335 : 0 : execlists->csb_write =
4336 [ # # ]: 0 : &engine->status_page.addr[intel_hws_csb_write_index(i915)];
4337 : :
4338 [ # # ]: 0 : if (INTEL_GEN(i915) < 11)
4339 : 0 : execlists->csb_size = GEN8_CSB_ENTRIES;
4340 : : else
4341 : 0 : execlists->csb_size = GEN11_CSB_ENTRIES;
4342 : :
4343 : 0 : reset_csb_pointers(engine);
4344 : :
4345 : : /* Finally, take ownership and responsibility for cleanup! */
4346 : 0 : engine->release = execlists_release;
4347 : :
4348 : 0 : return 0;
4349 : : }
4350 : :
4351 : : static u32 intel_lr_indirect_ctx_offset(const struct intel_engine_cs *engine)
4352 : : {
4353 : : u32 indirect_ctx_offset;
4354 : :
4355 : : switch (INTEL_GEN(engine->i915)) {
4356 : : default:
4357 : : MISSING_CASE(INTEL_GEN(engine->i915));
4358 : : /* fall through */
4359 : : case 12:
4360 : : indirect_ctx_offset =
4361 : : GEN12_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4362 : : break;
4363 : : case 11:
4364 : : indirect_ctx_offset =
4365 : : GEN11_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4366 : : break;
4367 : : case 10:
4368 : : indirect_ctx_offset =
4369 : : GEN10_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4370 : : break;
4371 : : case 9:
4372 : : indirect_ctx_offset =
4373 : : GEN9_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4374 : : break;
4375 : : case 8:
4376 : : indirect_ctx_offset =
4377 : : GEN8_CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT;
4378 : : break;
4379 : : }
4380 : :
4381 : : return indirect_ctx_offset;
4382 : : }
4383 : :
4384 : :
4385 : 0 : static void init_common_reg_state(u32 * const regs,
4386 : : const struct intel_engine_cs *engine,
4387 : : const struct intel_ring *ring,
4388 : : bool inhibit)
4389 : : {
4390 : 0 : u32 ctl;
4391 : :
4392 : 0 : ctl = _MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH);
4393 : 0 : ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT);
4394 : 0 : if (inhibit)
4395 : 0 : ctl |= CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT;
4396 [ # # ]: 0 : if (INTEL_GEN(engine->i915) < 11)
4397 : 0 : ctl |= _MASKED_BIT_DISABLE(CTX_CTRL_ENGINE_CTX_SAVE_INHIBIT |
4398 : : CTX_CTRL_RS_CTX_ENABLE);
4399 : 0 : regs[CTX_CONTEXT_CONTROL] = ctl;
4400 : :
4401 : 0 : regs[CTX_RING_CTL] = RING_CTL_SIZE(ring->size) | RING_VALID;
4402 : : }
4403 : :
4404 : 0 : static void init_wa_bb_reg_state(u32 * const regs,
4405 : : const struct intel_engine_cs *engine,
4406 : : u32 pos_bb_per_ctx)
4407 : : {
4408 : 0 : const struct i915_ctx_workarounds * const wa_ctx = &engine->wa_ctx;
4409 : :
4410 [ # # ]: 0 : if (wa_ctx->per_ctx.size) {
4411 : 0 : const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4412 : :
4413 : 0 : regs[pos_bb_per_ctx] =
4414 : 0 : (ggtt_offset + wa_ctx->per_ctx.offset) | 0x01;
4415 : : }
4416 : :
4417 [ # # ]: 0 : if (wa_ctx->indirect_ctx.size) {
4418 : 0 : const u32 ggtt_offset = i915_ggtt_offset(wa_ctx->vma);
4419 : :
4420 : 0 : regs[pos_bb_per_ctx + 2] =
4421 : 0 : (ggtt_offset + wa_ctx->indirect_ctx.offset) |
4422 : 0 : (wa_ctx->indirect_ctx.size / CACHELINE_BYTES);
4423 : :
4424 : 0 : regs[pos_bb_per_ctx + 4] =
4425 : 0 : intel_lr_indirect_ctx_offset(engine) << 6;
4426 : : }
4427 : 0 : }
4428 : :
4429 : 0 : static void init_ppgtt_reg_state(u32 *regs, const struct i915_ppgtt *ppgtt)
4430 : : {
4431 [ # # ]: 0 : if (i915_vm_is_4lvl(&ppgtt->vm)) {
4432 : : /* 64b PPGTT (48bit canonical)
4433 : : * PDP0_DESCRIPTOR contains the base address to PML4 and
4434 : : * other PDP Descriptors are ignored.
4435 : : */
4436 : 0 : ASSIGN_CTX_PML4(ppgtt, regs);
4437 : : } else {
4438 [ # # ]: 0 : ASSIGN_CTX_PDP(ppgtt, regs, 3);
4439 [ # # ]: 0 : ASSIGN_CTX_PDP(ppgtt, regs, 2);
4440 [ # # ]: 0 : ASSIGN_CTX_PDP(ppgtt, regs, 1);
4441 [ # # ]: 0 : ASSIGN_CTX_PDP(ppgtt, regs, 0);
4442 : : }
4443 : 0 : }
4444 : :
4445 : 0 : static struct i915_ppgtt *vm_alias(struct i915_address_space *vm)
4446 : : {
4447 : 0 : if (i915_is_ggtt(vm))
4448 : 0 : return i915_vm_to_ggtt(vm)->alias;
4449 : : else
4450 : : return i915_vm_to_ppgtt(vm);
4451 : : }
4452 : :
4453 : 0 : static void execlists_init_reg_state(u32 *regs,
4454 : : const struct intel_context *ce,
4455 : : const struct intel_engine_cs *engine,
4456 : : const struct intel_ring *ring,
4457 : : bool inhibit)
4458 : : {
4459 : : /*
4460 : : * A context is actually a big batch buffer with several
4461 : : * MI_LOAD_REGISTER_IMM commands followed by (reg, value) pairs. The
4462 : : * values we are setting here are only for the first context restore:
4463 : : * on a subsequent save, the GPU will recreate this batchbuffer with new
4464 : : * values (including all the missing MI_LOAD_REGISTER_IMM commands that
4465 : : * we are not initializing here).
4466 : : *
4467 : : * Must keep consistent with virtual_update_register_offsets().
4468 : : */
4469 [ # # ]: 0 : set_offsets(regs, reg_offsets(engine), engine, inhibit);
4470 : :
4471 [ # # ]: 0 : init_common_reg_state(regs, engine, ring, inhibit);
4472 [ # # ]: 0 : init_ppgtt_reg_state(regs, vm_alias(ce->vm));
4473 : :
4474 : 0 : init_wa_bb_reg_state(regs, engine,
4475 [ # # ]: 0 : INTEL_GEN(engine->i915) >= 12 ?
4476 : : GEN12_CTX_BB_PER_CTX_PTR :
4477 : : CTX_BB_PER_CTX_PTR);
4478 : :
4479 [ # # ]: 0 : __reset_stop_ring(regs, engine);
4480 : 0 : }
4481 : :
4482 : : static int
4483 : 0 : populate_lr_context(struct intel_context *ce,
4484 : : struct drm_i915_gem_object *ctx_obj,
4485 : : struct intel_engine_cs *engine,
4486 : : struct intel_ring *ring)
4487 : : {
4488 : 0 : bool inhibit = true;
4489 : 0 : void *vaddr;
4490 : 0 : int ret;
4491 : :
4492 : 0 : vaddr = i915_gem_object_pin_map(ctx_obj, I915_MAP_WB);
4493 [ # # ]: 0 : if (IS_ERR(vaddr)) {
4494 : 0 : ret = PTR_ERR(vaddr);
4495 : 0 : DRM_DEBUG_DRIVER("Could not map object pages! (%d)\n", ret);
4496 : 0 : return ret;
4497 : : }
4498 : :
4499 : 0 : set_redzone(vaddr, engine);
4500 : :
4501 [ # # ]: 0 : if (engine->default_state) {
4502 : 0 : void *defaults;
4503 : :
4504 : 0 : defaults = i915_gem_object_pin_map(engine->default_state,
4505 : : I915_MAP_WB);
4506 [ # # ]: 0 : if (IS_ERR(defaults)) {
4507 : 0 : ret = PTR_ERR(defaults);
4508 : 0 : goto err_unpin_ctx;
4509 : : }
4510 : :
4511 : 0 : memcpy(vaddr, defaults, engine->context_size);
4512 : 0 : i915_gem_object_unpin_map(engine->default_state);
4513 : 0 : __set_bit(CONTEXT_VALID_BIT, &ce->flags);
4514 : 0 : inhibit = false;
4515 : : }
4516 : :
4517 : : /* The second page of the context object contains some fields which must
4518 : : * be set up prior to the first execution. */
4519 : 0 : execlists_init_reg_state(vaddr + LRC_STATE_PN * PAGE_SIZE,
4520 : : ce, engine, ring, inhibit);
4521 : :
4522 : 0 : ret = 0;
4523 : 0 : err_unpin_ctx:
4524 : 0 : __i915_gem_object_flush_map(ctx_obj, 0, engine->context_size);
4525 : 0 : i915_gem_object_unpin_map(ctx_obj);
4526 : 0 : return ret;
4527 : : }
4528 : :
4529 : 0 : static int __execlists_context_alloc(struct intel_context *ce,
4530 : : struct intel_engine_cs *engine)
4531 : : {
4532 : 0 : struct drm_i915_gem_object *ctx_obj;
4533 : 0 : struct intel_ring *ring;
4534 : 0 : struct i915_vma *vma;
4535 : 0 : u32 context_size;
4536 : 0 : int ret;
4537 : :
4538 : 0 : GEM_BUG_ON(ce->state);
4539 : 0 : context_size = round_up(engine->context_size, I915_GTT_PAGE_SIZE);
4540 : :
4541 : 0 : if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
4542 : : context_size += I915_GTT_PAGE_SIZE; /* for redzone */
4543 : :
4544 : 0 : ctx_obj = i915_gem_object_create_shmem(engine->i915, context_size);
4545 [ # # ]: 0 : if (IS_ERR(ctx_obj))
4546 : 0 : return PTR_ERR(ctx_obj);
4547 : :
4548 : 0 : vma = i915_vma_instance(ctx_obj, &engine->gt->ggtt->vm, NULL);
4549 [ # # ]: 0 : if (IS_ERR(vma)) {
4550 : 0 : ret = PTR_ERR(vma);
4551 : 0 : goto error_deref_obj;
4552 : : }
4553 : :
4554 [ # # ]: 0 : if (!ce->timeline) {
4555 : 0 : struct intel_timeline *tl;
4556 : :
4557 : 0 : tl = intel_timeline_create(engine->gt, NULL);
4558 [ # # ]: 0 : if (IS_ERR(tl)) {
4559 : 0 : ret = PTR_ERR(tl);
4560 : 0 : goto error_deref_obj;
4561 : : }
4562 : :
4563 : 0 : ce->timeline = tl;
4564 : : }
4565 : :
4566 : 0 : ring = intel_engine_create_ring(engine, (unsigned long)ce->ring);
4567 [ # # ]: 0 : if (IS_ERR(ring)) {
4568 : 0 : ret = PTR_ERR(ring);
4569 : 0 : goto error_deref_obj;
4570 : : }
4571 : :
4572 : 0 : ret = populate_lr_context(ce, ctx_obj, engine, ring);
4573 [ # # ]: 0 : if (ret) {
4574 : 0 : DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
4575 : 0 : goto error_ring_free;
4576 : : }
4577 : :
4578 : 0 : ce->ring = ring;
4579 : 0 : ce->state = vma;
4580 : :
4581 : 0 : return 0;
4582 : :
4583 : : error_ring_free:
4584 : 0 : intel_ring_put(ring);
4585 : 0 : error_deref_obj:
4586 : 0 : i915_gem_object_put(ctx_obj);
4587 : 0 : return ret;
4588 : : }
4589 : :
4590 : 0 : static struct list_head *virtual_queue(struct virtual_engine *ve)
4591 : : {
4592 : 0 : return &ve->base.execlists.default_priolist.requests[0];
4593 : : }
4594 : :
4595 : 0 : static void virtual_context_destroy(struct kref *kref)
4596 : : {
4597 : 0 : struct virtual_engine *ve =
4598 : 0 : container_of(kref, typeof(*ve), context.ref);
4599 : 0 : unsigned int n;
4600 : :
4601 : 0 : GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4602 : 0 : GEM_BUG_ON(ve->request);
4603 : 0 : GEM_BUG_ON(ve->context.inflight);
4604 : :
4605 [ # # ]: 0 : for (n = 0; n < ve->num_siblings; n++) {
4606 : 0 : struct intel_engine_cs *sibling = ve->siblings[n];
4607 : 0 : struct rb_node *node = &ve->nodes[sibling->id].rb;
4608 : 0 : unsigned long flags;
4609 : :
4610 [ # # ]: 0 : if (RB_EMPTY_NODE(node))
4611 : 0 : continue;
4612 : :
4613 : 0 : spin_lock_irqsave(&sibling->active.lock, flags);
4614 : :
4615 : : /* Detachment is lazily performed in the execlists tasklet */
4616 [ # # ]: 0 : if (!RB_EMPTY_NODE(node))
4617 : 0 : rb_erase_cached(node, &sibling->execlists.virtual);
4618 : :
4619 : 0 : spin_unlock_irqrestore(&sibling->active.lock, flags);
4620 : : }
4621 : 0 : GEM_BUG_ON(__tasklet_is_scheduled(&ve->base.execlists.tasklet));
4622 : :
4623 [ # # ]: 0 : if (ve->context.state)
4624 : 0 : __execlists_context_fini(&ve->context);
4625 : 0 : intel_context_fini(&ve->context);
4626 : :
4627 : 0 : kfree(ve->bonds);
4628 : 0 : kfree(ve);
4629 : 0 : }
4630 : :
4631 : 0 : static void virtual_engine_initial_hint(struct virtual_engine *ve)
4632 : : {
4633 : 0 : int swp;
4634 : :
4635 : : /*
4636 : : * Pick a random sibling on starting to help spread the load around.
4637 : : *
4638 : : * New contexts are typically created with exactly the same order
4639 : : * of siblings, and often started in batches. Due to the way we iterate
4640 : : * the array of sibling when submitting requests, sibling[0] is
4641 : : * prioritised for dequeuing. If we make sure that sibling[0] is fairly
4642 : : * randomised across the system, we also help spread the load by the
4643 : : * first engine we inspect being different each time.
4644 : : *
4645 : : * NB This does not force us to execute on this engine, it will just
4646 : : * typically be the first we inspect for submission.
4647 : : */
4648 : 0 : swp = prandom_u32_max(ve->num_siblings);
4649 [ # # ]: 0 : if (!swp)
4650 : : return;
4651 : :
4652 : 0 : swap(ve->siblings[swp], ve->siblings[0]);
4653 [ # # ]: 0 : if (!intel_engine_has_relative_mmio(ve->siblings[0]))
4654 : 0 : virtual_update_register_offsets(ve->context.lrc_reg_state,
4655 : : ve->siblings[0]);
4656 : : }
4657 : :
4658 : 0 : static int virtual_context_alloc(struct intel_context *ce)
4659 : : {
4660 : 0 : struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4661 : :
4662 : 0 : return __execlists_context_alloc(ce, ve->siblings[0]);
4663 : : }
4664 : :
4665 : 0 : static int virtual_context_pin(struct intel_context *ce)
4666 : : {
4667 : 0 : struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4668 : 0 : int err;
4669 : :
4670 : : /* Note: we must use a real engine class for setting up reg state */
4671 : 0 : err = __execlists_context_pin(ce, ve->siblings[0]);
4672 [ # # ]: 0 : if (err)
4673 : : return err;
4674 : :
4675 : 0 : virtual_engine_initial_hint(ve);
4676 : 0 : return 0;
4677 : : }
4678 : :
4679 : 0 : static void virtual_context_enter(struct intel_context *ce)
4680 : : {
4681 : 0 : struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4682 : 0 : unsigned int n;
4683 : :
4684 [ # # ]: 0 : for (n = 0; n < ve->num_siblings; n++)
4685 : 0 : intel_engine_pm_get(ve->siblings[n]);
4686 : :
4687 : 0 : intel_timeline_enter(ce->timeline);
4688 : 0 : }
4689 : :
4690 : 0 : static void virtual_context_exit(struct intel_context *ce)
4691 : : {
4692 : 0 : struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
4693 : 0 : unsigned int n;
4694 : :
4695 : 0 : intel_timeline_exit(ce->timeline);
4696 : :
4697 [ # # ]: 0 : for (n = 0; n < ve->num_siblings; n++)
4698 : 0 : intel_engine_pm_put(ve->siblings[n]);
4699 : 0 : }
4700 : :
4701 : : static const struct intel_context_ops virtual_context_ops = {
4702 : : .alloc = virtual_context_alloc,
4703 : :
4704 : : .pin = virtual_context_pin,
4705 : : .unpin = execlists_context_unpin,
4706 : :
4707 : : .enter = virtual_context_enter,
4708 : : .exit = virtual_context_exit,
4709 : :
4710 : : .destroy = virtual_context_destroy,
4711 : : };
4712 : :
4713 : 0 : static intel_engine_mask_t virtual_submission_mask(struct virtual_engine *ve)
4714 : : {
4715 : 0 : struct i915_request *rq;
4716 : 0 : intel_engine_mask_t mask;
4717 : :
4718 : 0 : rq = READ_ONCE(ve->request);
4719 [ # # ]: 0 : if (!rq)
4720 : : return 0;
4721 : :
4722 : : /* The rq is ready for submission; rq->execution_mask is now stable. */
4723 : 0 : mask = rq->execution_mask;
4724 [ # # ]: 0 : if (unlikely(!mask)) {
4725 : : /* Invalid selection, submit to a random engine in error */
4726 : 0 : i915_request_skip(rq, -ENODEV);
4727 : 0 : mask = ve->siblings[0]->mask;
4728 : : }
4729 : :
4730 : : ENGINE_TRACE(&ve->base, "rq=%llx:%lld, mask=%x, prio=%d\n",
4731 : : rq->fence.context, rq->fence.seqno,
4732 : : mask, ve->base.execlists.queue_priority_hint);
4733 : :
4734 : : return mask;
4735 : : }
4736 : :
4737 : 0 : static void virtual_submission_tasklet(unsigned long data)
4738 : : {
4739 : 0 : struct virtual_engine * const ve = (struct virtual_engine *)data;
4740 : 0 : const int prio = ve->base.execlists.queue_priority_hint;
4741 : 0 : intel_engine_mask_t mask;
4742 : 0 : unsigned int n;
4743 : :
4744 : 0 : rcu_read_lock();
4745 [ # # ]: 0 : mask = virtual_submission_mask(ve);
4746 : 0 : rcu_read_unlock();
4747 [ # # ]: 0 : if (unlikely(!mask))
4748 : : return;
4749 : :
4750 : 0 : local_irq_disable();
4751 [ # # # # ]: 0 : for (n = 0; READ_ONCE(ve->request) && n < ve->num_siblings; n++) {
4752 : 0 : struct intel_engine_cs *sibling = ve->siblings[n];
4753 : 0 : struct ve_node * const node = &ve->nodes[sibling->id];
4754 : 0 : struct rb_node **parent, *rb;
4755 : 0 : bool first;
4756 : :
4757 [ # # ]: 0 : if (unlikely(!(mask & sibling->mask))) {
4758 [ # # ]: 0 : if (!RB_EMPTY_NODE(&node->rb)) {
4759 : 0 : spin_lock(&sibling->active.lock);
4760 : 0 : rb_erase_cached(&node->rb,
4761 : : &sibling->execlists.virtual);
4762 : 0 : RB_CLEAR_NODE(&node->rb);
4763 : 0 : spin_unlock(&sibling->active.lock);
4764 : : }
4765 : 0 : continue;
4766 : : }
4767 : :
4768 : 0 : spin_lock(&sibling->active.lock);
4769 : :
4770 [ # # ]: 0 : if (!RB_EMPTY_NODE(&node->rb)) {
4771 : : /*
4772 : : * Cheat and avoid rebalancing the tree if we can
4773 : : * reuse this node in situ.
4774 : : */
4775 : 0 : first = rb_first_cached(&sibling->execlists.virtual) ==
4776 : : &node->rb;
4777 [ # # # # : 0 : if (prio == node->prio || (prio > node->prio && first))
# # ]
4778 : 0 : goto submit_engine;
4779 : :
4780 : 0 : rb_erase_cached(&node->rb, &sibling->execlists.virtual);
4781 : : }
4782 : :
4783 : 0 : rb = NULL;
4784 : 0 : first = true;
4785 : 0 : parent = &sibling->execlists.virtual.rb_root.rb_node;
4786 [ # # ]: 0 : while (*parent) {
4787 : 0 : struct ve_node *other;
4788 : :
4789 : 0 : rb = *parent;
4790 : 0 : other = rb_entry(rb, typeof(*other), rb);
4791 [ # # ]: 0 : if (prio > other->prio) {
4792 : 0 : parent = &rb->rb_left;
4793 : : } else {
4794 : 0 : parent = &rb->rb_right;
4795 : 0 : first = false;
4796 : : }
4797 : : }
4798 : :
4799 [ # # ]: 0 : rb_link_node(&node->rb, rb, parent);
4800 [ # # ]: 0 : rb_insert_color_cached(&node->rb,
4801 : : &sibling->execlists.virtual,
4802 : : first);
4803 : :
4804 : 0 : submit_engine:
4805 : 0 : GEM_BUG_ON(RB_EMPTY_NODE(&node->rb));
4806 : 0 : node->prio = prio;
4807 [ # # # # ]: 0 : if (first && prio > sibling->execlists.queue_priority_hint) {
4808 : 0 : sibling->execlists.queue_priority_hint = prio;
4809 : 0 : tasklet_hi_schedule(&sibling->execlists.tasklet);
4810 : : }
4811 : :
4812 : 0 : spin_unlock(&sibling->active.lock);
4813 : : }
4814 : 0 : local_irq_enable();
4815 : : }
4816 : :
4817 : 0 : static void virtual_submit_request(struct i915_request *rq)
4818 : : {
4819 : 0 : struct virtual_engine *ve = to_virtual_engine(rq->engine);
4820 : 0 : struct i915_request *old;
4821 : 0 : unsigned long flags;
4822 : :
4823 : 0 : ENGINE_TRACE(&ve->base, "rq=%llx:%lld\n",
4824 : : rq->fence.context,
4825 : : rq->fence.seqno);
4826 : :
4827 : 0 : GEM_BUG_ON(ve->base.submit_request != virtual_submit_request);
4828 : :
4829 : 0 : spin_lock_irqsave(&ve->base.active.lock, flags);
4830 : :
4831 : 0 : old = ve->request;
4832 [ # # ]: 0 : if (old) { /* background completion event from preempt-to-busy */
4833 : 0 : GEM_BUG_ON(!i915_request_completed(old));
4834 : 0 : __i915_request_submit(old);
4835 : 0 : i915_request_put(old);
4836 : : }
4837 : :
4838 [ # # ]: 0 : if (i915_request_completed(rq)) {
4839 : 0 : __i915_request_submit(rq);
4840 : :
4841 : 0 : ve->base.execlists.queue_priority_hint = INT_MIN;
4842 : 0 : ve->request = NULL;
4843 : : } else {
4844 : 0 : ve->base.execlists.queue_priority_hint = rq_prio(rq);
4845 [ # # ]: 0 : ve->request = i915_request_get(rq);
4846 : :
4847 : 0 : GEM_BUG_ON(!list_empty(virtual_queue(ve)));
4848 : 0 : list_move_tail(&rq->sched.link, virtual_queue(ve));
4849 : :
4850 : 0 : tasklet_schedule(&ve->base.execlists.tasklet);
4851 : : }
4852 : :
4853 : 0 : spin_unlock_irqrestore(&ve->base.active.lock, flags);
4854 : 0 : }
4855 : :
4856 : : static struct ve_bond *
4857 : 0 : virtual_find_bond(struct virtual_engine *ve,
4858 : : const struct intel_engine_cs *master)
4859 : : {
4860 : : int i;
4861 : :
4862 [ # # # # ]: 0 : for (i = 0; i < ve->num_bonds; i++) {
4863 [ # # # # ]: 0 : if (ve->bonds[i].master == master)
4864 : : return &ve->bonds[i];
4865 : : }
4866 : :
4867 : : return NULL;
4868 : : }
4869 : :
4870 : : static void
4871 : 0 : virtual_bond_execute(struct i915_request *rq, struct dma_fence *signal)
4872 : : {
4873 : 0 : struct virtual_engine *ve = to_virtual_engine(rq->engine);
4874 : 0 : intel_engine_mask_t allowed, exec;
4875 : 0 : struct ve_bond *bond;
4876 : :
4877 : 0 : allowed = ~to_request(signal)->engine->mask;
4878 : :
4879 : 0 : bond = virtual_find_bond(ve, to_request(signal)->engine);
4880 [ # # ]: 0 : if (bond)
4881 : 0 : allowed &= bond->sibling_mask;
4882 : :
4883 : : /* Restrict the bonded request to run on only the available engines */
4884 : 0 : exec = READ_ONCE(rq->execution_mask);
4885 [ # # # # ]: 0 : while (!try_cmpxchg(&rq->execution_mask, &exec, exec & allowed))
4886 : 0 : ;
4887 : :
4888 : : /* Prevent the master from being re-run on the bonded engines */
4889 : 0 : to_request(signal)->execution_mask &= ~allowed;
4890 : 0 : }
4891 : :
4892 : : struct intel_context *
4893 : 0 : intel_execlists_create_virtual(struct intel_engine_cs **siblings,
4894 : : unsigned int count)
4895 : : {
4896 : 0 : struct virtual_engine *ve;
4897 : 0 : unsigned int n;
4898 : 0 : int err;
4899 : :
4900 [ # # ]: 0 : if (count == 0)
4901 : : return ERR_PTR(-EINVAL);
4902 : :
4903 [ # # ]: 0 : if (count == 1)
4904 : 0 : return intel_context_create(siblings[0]);
4905 : :
4906 [ # # ]: 0 : ve = kzalloc(struct_size(ve, siblings, count), GFP_KERNEL);
4907 [ # # ]: 0 : if (!ve)
4908 : : return ERR_PTR(-ENOMEM);
4909 : :
4910 : 0 : ve->base.i915 = siblings[0]->i915;
4911 : 0 : ve->base.gt = siblings[0]->gt;
4912 : 0 : ve->base.uncore = siblings[0]->uncore;
4913 : 0 : ve->base.id = -1;
4914 : :
4915 : 0 : ve->base.class = OTHER_CLASS;
4916 : 0 : ve->base.uabi_class = I915_ENGINE_CLASS_INVALID;
4917 : 0 : ve->base.instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4918 : 0 : ve->base.uabi_instance = I915_ENGINE_CLASS_INVALID_VIRTUAL;
4919 : :
4920 : : /*
4921 : : * The decision on whether to submit a request using semaphores
4922 : : * depends on the saturated state of the engine. We only compute
4923 : : * this during HW submission of the request, and we need for this
4924 : : * state to be globally applied to all requests being submitted
4925 : : * to this engine. Virtual engines encompass more than one physical
4926 : : * engine and so we cannot accurately tell in advance if one of those
4927 : : * engines is already saturated and so cannot afford to use a semaphore
4928 : : * and be pessimized in priority for doing so -- if we are the only
4929 : : * context using semaphores after all other clients have stopped, we
4930 : : * will be starved on the saturated system. Such a global switch for
4931 : : * semaphores is less than ideal, but alas is the current compromise.
4932 : : */
4933 : 0 : ve->base.saturated = ALL_ENGINES;
4934 : :
4935 : 0 : snprintf(ve->base.name, sizeof(ve->base.name), "virtual");
4936 : :
4937 : 0 : intel_engine_init_active(&ve->base, ENGINE_VIRTUAL);
4938 : 0 : intel_engine_init_breadcrumbs(&ve->base);
4939 : 0 : intel_engine_init_execlists(&ve->base);
4940 : :
4941 : 0 : ve->base.cops = &virtual_context_ops;
4942 : 0 : ve->base.request_alloc = execlists_request_alloc;
4943 : :
4944 : 0 : ve->base.schedule = i915_schedule;
4945 : 0 : ve->base.submit_request = virtual_submit_request;
4946 : 0 : ve->base.bond_execute = virtual_bond_execute;
4947 : :
4948 : 0 : INIT_LIST_HEAD(virtual_queue(ve));
4949 : 0 : ve->base.execlists.queue_priority_hint = INT_MIN;
4950 : 0 : tasklet_init(&ve->base.execlists.tasklet,
4951 : : virtual_submission_tasklet,
4952 : : (unsigned long)ve);
4953 : :
4954 : 0 : intel_context_init(&ve->context, &ve->base);
4955 : :
4956 [ # # ]: 0 : for (n = 0; n < count; n++) {
4957 : 0 : struct intel_engine_cs *sibling = siblings[n];
4958 : :
4959 : 0 : GEM_BUG_ON(!is_power_of_2(sibling->mask));
4960 [ # # ]: 0 : if (sibling->mask & ve->base.mask) {
4961 : 0 : DRM_DEBUG("duplicate %s entry in load balancer\n",
4962 : : sibling->name);
4963 : 0 : err = -EINVAL;
4964 : 0 : goto err_put;
4965 : : }
4966 : :
4967 : : /*
4968 : : * The virtual engine implementation is tightly coupled to
4969 : : * the execlists backend -- we push out request directly
4970 : : * into a tree inside each physical engine. We could support
4971 : : * layering if we handle cloning of the requests and
4972 : : * submitting a copy into each backend.
4973 : : */
4974 [ # # ]: 0 : if (sibling->execlists.tasklet.func !=
4975 : : execlists_submission_tasklet) {
4976 : 0 : err = -ENODEV;
4977 : 0 : goto err_put;
4978 : : }
4979 : :
4980 : 0 : GEM_BUG_ON(RB_EMPTY_NODE(&ve->nodes[sibling->id].rb));
4981 : 0 : RB_CLEAR_NODE(&ve->nodes[sibling->id].rb);
4982 : :
4983 : 0 : ve->siblings[ve->num_siblings++] = sibling;
4984 : 0 : ve->base.mask |= sibling->mask;
4985 : :
4986 : : /*
4987 : : * All physical engines must be compatible for their emission
4988 : : * functions (as we build the instructions during request
4989 : : * construction and do not alter them before submission
4990 : : * on the physical engine). We use the engine class as a guide
4991 : : * here, although that could be refined.
4992 : : */
4993 [ # # ]: 0 : if (ve->base.class != OTHER_CLASS) {
4994 [ # # ]: 0 : if (ve->base.class != sibling->class) {
4995 : 0 : DRM_DEBUG("invalid mixing of engine class, sibling %d, already %d\n",
4996 : : sibling->class, ve->base.class);
4997 : 0 : err = -EINVAL;
4998 : 0 : goto err_put;
4999 : : }
5000 : 0 : continue;
5001 : : }
5002 : :
5003 : 0 : ve->base.class = sibling->class;
5004 : 0 : ve->base.uabi_class = sibling->uabi_class;
5005 : 0 : snprintf(ve->base.name, sizeof(ve->base.name),
5006 : : "v%dx%d", ve->base.class, count);
5007 : 0 : ve->base.context_size = sibling->context_size;
5008 : :
5009 : 0 : ve->base.emit_bb_start = sibling->emit_bb_start;
5010 : 0 : ve->base.emit_flush = sibling->emit_flush;
5011 : 0 : ve->base.emit_init_breadcrumb = sibling->emit_init_breadcrumb;
5012 : 0 : ve->base.emit_fini_breadcrumb = sibling->emit_fini_breadcrumb;
5013 : 0 : ve->base.emit_fini_breadcrumb_dw =
5014 : 0 : sibling->emit_fini_breadcrumb_dw;
5015 : :
5016 : 0 : ve->base.flags = sibling->flags;
5017 : : }
5018 : :
5019 : 0 : ve->base.flags |= I915_ENGINE_IS_VIRTUAL;
5020 : :
5021 : 0 : return &ve->context;
5022 : :
5023 : 0 : err_put:
5024 : 0 : intel_context_put(&ve->context);
5025 : 0 : return ERR_PTR(err);
5026 : : }
5027 : :
5028 : : struct intel_context *
5029 : 0 : intel_execlists_clone_virtual(struct intel_engine_cs *src)
5030 : : {
5031 : 0 : struct virtual_engine *se = to_virtual_engine(src);
5032 : 0 : struct intel_context *dst;
5033 : :
5034 : 0 : dst = intel_execlists_create_virtual(se->siblings,
5035 : : se->num_siblings);
5036 [ # # ]: 0 : if (IS_ERR(dst))
5037 : : return dst;
5038 : :
5039 [ # # ]: 0 : if (se->num_bonds) {
5040 : 0 : struct virtual_engine *de = to_virtual_engine(dst->engine);
5041 : :
5042 : 0 : de->bonds = kmemdup(se->bonds,
5043 : 0 : sizeof(*se->bonds) * se->num_bonds,
5044 : : GFP_KERNEL);
5045 [ # # ]: 0 : if (!de->bonds) {
5046 : 0 : intel_context_put(dst);
5047 : 0 : return ERR_PTR(-ENOMEM);
5048 : : }
5049 : :
5050 : 0 : de->num_bonds = se->num_bonds;
5051 : : }
5052 : :
5053 : : return dst;
5054 : : }
5055 : :
5056 : 0 : int intel_virtual_engine_attach_bond(struct intel_engine_cs *engine,
5057 : : const struct intel_engine_cs *master,
5058 : : const struct intel_engine_cs *sibling)
5059 : : {
5060 : 0 : struct virtual_engine *ve = to_virtual_engine(engine);
5061 : 0 : struct ve_bond *bond;
5062 : 0 : int n;
5063 : :
5064 : : /* Sanity check the sibling is part of the virtual engine */
5065 [ # # ]: 0 : for (n = 0; n < ve->num_siblings; n++)
5066 [ # # ]: 0 : if (sibling == ve->siblings[n])
5067 : : break;
5068 [ # # ]: 0 : if (n == ve->num_siblings)
5069 : : return -EINVAL;
5070 : :
5071 : 0 : bond = virtual_find_bond(ve, master);
5072 [ # # ]: 0 : if (bond) {
5073 : 0 : bond->sibling_mask |= sibling->mask;
5074 : 0 : return 0;
5075 : : }
5076 : :
5077 : 0 : bond = krealloc(ve->bonds,
5078 : 0 : sizeof(*bond) * (ve->num_bonds + 1),
5079 : : GFP_KERNEL);
5080 [ # # ]: 0 : if (!bond)
5081 : : return -ENOMEM;
5082 : :
5083 : 0 : bond[ve->num_bonds].master = master;
5084 : 0 : bond[ve->num_bonds].sibling_mask = sibling->mask;
5085 : :
5086 : 0 : ve->bonds = bond;
5087 : 0 : ve->num_bonds++;
5088 : :
5089 : 0 : return 0;
5090 : : }
5091 : :
5092 : : struct intel_engine_cs *
5093 : 0 : intel_virtual_engine_get_sibling(struct intel_engine_cs *engine,
5094 : : unsigned int sibling)
5095 : : {
5096 : 0 : struct virtual_engine *ve = to_virtual_engine(engine);
5097 : :
5098 [ # # ]: 0 : if (sibling >= ve->num_siblings)
5099 : : return NULL;
5100 : :
5101 : 0 : return ve->siblings[sibling];
5102 : : }
5103 : :
5104 : 0 : void intel_execlists_show_requests(struct intel_engine_cs *engine,
5105 : : struct drm_printer *m,
5106 : : void (*show_request)(struct drm_printer *m,
5107 : : struct i915_request *rq,
5108 : : const char *prefix),
5109 : : unsigned int max)
5110 : : {
5111 : 0 : const struct intel_engine_execlists *execlists = &engine->execlists;
5112 : 0 : struct i915_request *rq, *last;
5113 : 0 : unsigned long flags;
5114 : 0 : unsigned int count;
5115 : 0 : struct rb_node *rb;
5116 : :
5117 : 0 : spin_lock_irqsave(&engine->active.lock, flags);
5118 : :
5119 : 0 : last = NULL;
5120 : 0 : count = 0;
5121 [ # # ]: 0 : list_for_each_entry(rq, &engine->active.requests, sched.link) {
5122 [ # # ]: 0 : if (count++ < max - 1)
5123 : 0 : show_request(m, rq, "\t\tE ");
5124 : : else
5125 : : last = rq;
5126 : : }
5127 [ # # ]: 0 : if (last) {
5128 [ # # ]: 0 : if (count > max) {
5129 : 0 : drm_printf(m,
5130 : : "\t\t...skipping %d executing requests...\n",
5131 : : count - max);
5132 : : }
5133 : 0 : show_request(m, last, "\t\tE ");
5134 : : }
5135 : :
5136 : 0 : last = NULL;
5137 : 0 : count = 0;
5138 [ # # ]: 0 : if (execlists->queue_priority_hint != INT_MIN)
5139 : 0 : drm_printf(m, "\t\tQueue priority hint: %d\n",
5140 : : execlists->queue_priority_hint);
5141 [ # # ]: 0 : for (rb = rb_first_cached(&execlists->queue); rb; rb = rb_next(rb)) {
5142 : : struct i915_priolist *p = rb_entry(rb, typeof(*p), node);
5143 : : int i;
5144 : :
5145 [ # # # # ]: 0 : priolist_for_each_request(rq, p, i) {
5146 [ # # ]: 0 : if (count++ < max - 1)
5147 : 0 : show_request(m, rq, "\t\tQ ");
5148 : : else
5149 : : last = rq;
5150 : : }
5151 : : }
5152 [ # # ]: 0 : if (last) {
5153 [ # # ]: 0 : if (count > max) {
5154 : 0 : drm_printf(m,
5155 : : "\t\t...skipping %d queued requests...\n",
5156 : : count - max);
5157 : : }
5158 : 0 : show_request(m, last, "\t\tQ ");
5159 : : }
5160 : :
5161 : 0 : last = NULL;
5162 : 0 : count = 0;
5163 [ # # ]: 0 : for (rb = rb_first_cached(&execlists->virtual); rb; rb = rb_next(rb)) {
5164 : 0 : struct virtual_engine *ve =
5165 : 0 : rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
5166 [ # # ]: 0 : struct i915_request *rq = READ_ONCE(ve->request);
5167 : :
5168 [ # # ]: 0 : if (rq) {
5169 [ # # ]: 0 : if (count++ < max - 1)
5170 : 0 : show_request(m, rq, "\t\tV ");
5171 : : else
5172 : : last = rq;
5173 : : }
5174 : : }
5175 [ # # ]: 0 : if (last) {
5176 [ # # ]: 0 : if (count > max) {
5177 : 0 : drm_printf(m,
5178 : : "\t\t...skipping %d virtual requests...\n",
5179 : : count - max);
5180 : : }
5181 : 0 : show_request(m, last, "\t\tV ");
5182 : : }
5183 : :
5184 : 0 : spin_unlock_irqrestore(&engine->active.lock, flags);
5185 : 0 : }
5186 : :
5187 : 0 : void intel_lr_context_reset(struct intel_engine_cs *engine,
5188 : : struct intel_context *ce,
5189 : : u32 head,
5190 : : bool scrub)
5191 : : {
5192 : 0 : GEM_BUG_ON(!intel_context_is_pinned(ce));
5193 : :
5194 : : /*
5195 : : * We want a simple context + ring to execute the breadcrumb update.
5196 : : * We cannot rely on the context being intact across the GPU hang,
5197 : : * so clear it and rebuild just what we need for the breadcrumb.
5198 : : * All pending requests for this context will be zapped, and any
5199 : : * future request will be after userspace has had the opportunity
5200 : : * to recreate its own state.
5201 : : */
5202 [ # # ]: 0 : if (scrub)
5203 : 0 : restore_default_state(ce, engine);
5204 : :
5205 : : /* Rerun the request; its payload has been neutered (if guilty). */
5206 : 0 : __execlists_update_reg_state(ce, engine, head);
5207 : 0 : }
5208 : :
5209 : : bool
5210 : 0 : intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine)
5211 : : {
5212 : 0 : return engine->set_default_submission ==
5213 : : intel_execlists_set_default_submission;
5214 : : }
5215 : :
5216 : : #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
5217 : : #include "selftest_lrc.c"
5218 : : #endif
|