77#include "i915_drv.h"
88#include "intel_gpu_commands.h"
99
/* Written as the third inline-data dword of every MEDIA_OBJECT packet. */
#define GT3_INLINE_DATA_DELAYS 0x1E00

/*
 * Sanity check after filling a reservation: the write cursor CS must
 * match the end pointer recorded in chunk Y.
 *
 * There must be no space between the macro name and its parameter list,
 * otherwise the preprocessor treats this as an object-like macro.
 */
#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
1412
@@ -34,38 +32,59 @@ struct batch_chunk {
3432};
3533
3634struct batch_vals {
37- u32 max_primitives ;
38- u32 max_urb_entries ;
39- u32 cmd_size ;
40- u32 state_size ;
35+ u32 max_threads ;
4136 u32 state_start ;
42- u32 batch_size ;
37+ u32 surface_start ;
4338 u32 surface_height ;
4439 u32 surface_width ;
45- u32 scratch_size ;
46- u32 max_size ;
40+ u32 size ;
4741};
4842
43+ static inline int num_primitives (const struct batch_vals * bv )
44+ {
45+ /*
46+ * We need to saturate the GPU with work in order to dispatch
47+ * a shader on every HW thread, and clear the thread-local registers.
48+ * In short, we have to dispatch work faster than the shaders can
49+ * run in order to fill the EU and occupy each HW thread.
50+ */
51+ return bv -> max_threads ;
52+ }
53+
4954static void
5055batch_get_defaults (struct drm_i915_private * i915 , struct batch_vals * bv )
5156{
5257 if (IS_HASWELL (i915 )) {
53- bv -> max_primitives = 280 ;
54- bv -> max_urb_entries = MAX_URB_ENTRIES ;
58+ switch (INTEL_INFO (i915 )-> gt ) {
59+ default :
60+ case 1 :
61+ bv -> max_threads = 70 ;
62+ break ;
63+ case 2 :
64+ bv -> max_threads = 140 ;
65+ break ;
66+ case 3 :
67+ bv -> max_threads = 280 ;
68+ break ;
69+ }
5570 bv -> surface_height = 16 * 16 ;
5671 bv -> surface_width = 32 * 2 * 16 ;
5772 } else {
58- bv -> max_primitives = 128 ;
59- bv -> max_urb_entries = MAX_URB_ENTRIES / 2 ;
73+ switch (INTEL_INFO (i915 )-> gt ) {
74+ default :
75+ case 1 : /* including vlv */
76+ bv -> max_threads = 36 ;
77+ break ;
78+ case 2 :
79+ bv -> max_threads = 128 ;
80+ break ;
81+ }
6082 bv -> surface_height = 16 * 8 ;
6183 bv -> surface_width = 32 * 16 ;
6284 }
63- bv -> cmd_size = bv -> max_primitives * 4096 ;
64- bv -> state_size = STATE_SIZE ;
65- bv -> state_start = bv -> cmd_size ;
66- bv -> batch_size = bv -> cmd_size + bv -> state_size ;
67- bv -> scratch_size = bv -> surface_height * bv -> surface_width ;
68- bv -> max_size = bv -> batch_size + bv -> scratch_size ;
85+ bv -> state_start = round_up (SZ_1K + num_primitives (bv ) * 64 , SZ_4K );
86+ bv -> surface_start = bv -> state_start + SZ_4K ;
87+ bv -> size = bv -> surface_start + bv -> surface_height * bv -> surface_width ;
6988}
7089
7190static void batch_init (struct batch_chunk * bc ,
@@ -155,7 +174,8 @@ static u32
155174gen7_fill_binding_table (struct batch_chunk * state ,
156175 const struct batch_vals * bv )
157176{
158- u32 surface_start = gen7_fill_surface_state (state , bv -> batch_size , bv );
177+ u32 surface_start =
178+ gen7_fill_surface_state (state , bv -> surface_start , bv );
159179 u32 * cs = batch_alloc_items (state , 32 , 8 );
160180 u32 offset = batch_offset (state , cs );
161181
@@ -214,9 +234,9 @@ static void
214234gen7_emit_state_base_address (struct batch_chunk * batch ,
215235 u32 surface_state_base )
216236{
217- u32 * cs = batch_alloc_items (batch , 0 , 12 );
237+ u32 * cs = batch_alloc_items (batch , 0 , 10 );
218238
219- * cs ++ = STATE_BASE_ADDRESS | (12 - 2 );
239+ * cs ++ = STATE_BASE_ADDRESS | (10 - 2 );
220240 /* general */
221241 * cs ++ = batch_addr (batch ) | BASE_ADDRESS_MODIFY ;
222242 /* surface */
@@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
233253 * cs ++ = BASE_ADDRESS_MODIFY ;
234254 * cs ++ = 0 ;
235255 * cs ++ = BASE_ADDRESS_MODIFY ;
236- * cs ++ = 0 ;
237- * cs ++ = 0 ;
238256 batch_advance (batch , cs );
239257}
240258
@@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
244262 u32 urb_size , u32 curbe_size ,
245263 u32 mode )
246264{
247- u32 urb_entries = bv -> max_urb_entries ;
248- u32 threads = bv -> max_primitives - 1 ;
265+ u32 threads = bv -> max_threads - 1 ;
249266 u32 * cs = batch_alloc_items (batch , 32 , 8 );
250267
251268 * cs ++ = MEDIA_VFE_STATE | (8 - 2 );
@@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
254271 * cs ++ = 0 ;
255272
256273 /* number of threads & urb entries for GPGPU vs Media Mode */
257- * cs ++ = threads << 16 | urb_entries << 8 | mode << 2 ;
274+ * cs ++ = threads << 16 | 1 << 8 | mode << 2 ;
258275
259276 * cs ++ = 0 ;
260277
@@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
293310{
294311 unsigned int x_offset = (media_object_index % 16 ) * 64 ;
295312 unsigned int y_offset = (media_object_index / 16 ) * 16 ;
296- unsigned int inline_data_size ;
297- unsigned int media_batch_size ;
298- unsigned int i ;
313+ unsigned int pkt = 6 + 3 ;
299314 u32 * cs ;
300315
301- inline_data_size = 112 * 8 ;
302- media_batch_size = inline_data_size + 6 ;
303-
304- cs = batch_alloc_items (batch , 8 , media_batch_size );
316+ cs = batch_alloc_items (batch , 8 , pkt );
305317
306- * cs ++ = MEDIA_OBJECT | (media_batch_size - 2 );
318+ * cs ++ = MEDIA_OBJECT | (pkt - 2 );
307319
308320 /* interface descriptor offset */
309321 * cs ++ = 0 ;
@@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
317329 * cs ++ = 0 ;
318330
319331 /* inline */
320- * cs ++ = ( y_offset << 16 ) | ( x_offset ) ;
332+ * cs ++ = y_offset << 16 | x_offset ;
321333 * cs ++ = 0 ;
322334 * cs ++ = GT3_INLINE_DATA_DELAYS ;
323- for (i = 3 ; i < inline_data_size ; i ++ )
324- * cs ++ = 0 ;
325335
326336 batch_advance (batch , cs );
327337}
328338
329339static void gen7_emit_pipeline_flush (struct batch_chunk * batch )
330340{
331- u32 * cs = batch_alloc_items (batch , 0 , 5 );
341+ u32 * cs = batch_alloc_items (batch , 0 , 4 );
332342
333- * cs ++ = GFX_OP_PIPE_CONTROL (5 );
334- * cs ++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE |
335- PIPE_CONTROL_GLOBAL_GTT_IVB ;
343+ * cs ++ = GFX_OP_PIPE_CONTROL (4 );
344+ * cs ++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
345+ PIPE_CONTROL_DEPTH_CACHE_FLUSH |
346+ PIPE_CONTROL_DC_FLUSH_ENABLE |
347+ PIPE_CONTROL_CS_STALL ;
336348 * cs ++ = 0 ;
337349 * cs ++ = 0 ;
350+
351+ batch_advance (batch , cs );
352+ }
353+
354+ static void gen7_emit_pipeline_invalidate (struct batch_chunk * batch )
355+ {
356+ u32 * cs = batch_alloc_items (batch , 0 , 8 );
357+
358+ /* ivb: Stall before STATE_CACHE_INVALIDATE */
359+ * cs ++ = GFX_OP_PIPE_CONTROL (4 );
360+ * cs ++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
361+ PIPE_CONTROL_CS_STALL ;
362+ * cs ++ = 0 ;
363+ * cs ++ = 0 ;
364+
365+ * cs ++ = GFX_OP_PIPE_CONTROL (4 );
366+ * cs ++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE ;
338367 * cs ++ = 0 ;
368+ * cs ++ = 0 ;
369+
339370 batch_advance (batch , cs );
340371}
341372
@@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
344375 const struct batch_vals * bv )
345376{
346377 struct drm_i915_private * i915 = vma -> vm -> i915 ;
347- unsigned int desc_count = 64 ;
348- const u32 urb_size = 112 ;
378+ const unsigned int desc_count = 1 ;
379+ const unsigned int urb_size = 1 ;
349380 struct batch_chunk cmds , state ;
350- u32 interface_descriptor ;
381+ u32 descriptors ;
351382 unsigned int i ;
352383
353- batch_init (& cmds , vma , start , 0 , bv -> cmd_size );
354- batch_init (& state , vma , start , bv -> state_start , bv -> state_size );
384+ batch_init (& cmds , vma , start , 0 , bv -> state_start );
385+ batch_init (& state , vma , start , bv -> state_start , SZ_4K );
355386
356- interface_descriptor =
357- gen7_fill_interface_descriptor ( & state , bv ,
358- IS_HASWELL ( i915 ) ?
359- & cb_kernel_hsw :
360- & cb_kernel_ivb ,
361- desc_count );
362- gen7_emit_pipeline_flush (& cmds );
387+ descriptors = gen7_fill_interface_descriptor ( & state , bv ,
388+ IS_HASWELL ( i915 ) ?
389+ & cb_kernel_hsw :
390+ & cb_kernel_ivb ,
391+ desc_count );
392+
393+ gen7_emit_pipeline_invalidate (& cmds );
363394 batch_add (& cmds , PIPELINE_SELECT | PIPELINE_SELECT_MEDIA );
364395 batch_add (& cmds , MI_NOOP );
365- gen7_emit_state_base_address (& cmds , interface_descriptor );
396+ gen7_emit_pipeline_invalidate (& cmds );
397+
366398 gen7_emit_pipeline_flush (& cmds );
399+ gen7_emit_state_base_address (& cmds , descriptors );
400+ gen7_emit_pipeline_invalidate (& cmds );
367401
368402 gen7_emit_vfe_state (& cmds , bv , urb_size - 1 , 0 , 0 );
403+ gen7_emit_interface_descriptor_load (& cmds , descriptors , desc_count );
369404
370- gen7_emit_interface_descriptor_load (& cmds ,
371- interface_descriptor ,
372- desc_count );
373-
374- for (i = 0 ; i < bv -> max_primitives ; i ++ )
405+ for (i = 0 ; i < num_primitives (bv ); i ++ )
375406 gen7_emit_media_object (& cmds , i );
376407
377408 batch_add (& cmds , MI_BATCH_BUFFER_END );
@@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
385416
386417 batch_get_defaults (engine -> i915 , & bv );
387418 if (!vma )
388- return bv .max_size ;
419+ return bv .size ;
389420
390- GEM_BUG_ON (vma -> obj -> base .size < bv .max_size );
421+ GEM_BUG_ON (vma -> obj -> base .size < bv .size );
391422
392423 batch = i915_gem_object_pin_map (vma -> obj , I915_MAP_WC );
393424 if (IS_ERR (batch ))
394425 return PTR_ERR (batch );
395426
396- emit_batch (vma , memset (batch , 0 , bv .max_size ), & bv );
427+ emit_batch (vma , memset (batch , 0 , bv .size ), & bv );
397428
398429 i915_gem_object_flush_map (vma -> obj );
399430 __i915_gem_object_release_map (vma -> obj );
0 commit comments