Skip to content
24 changes: 12 additions & 12 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -645,17 +645,17 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
biasPhysReg(Cand.SU, Cand.AtTop), TryCand, Cand, PhysReg))
return TryCand.Reason != NoCand;

// Avoid exceeding the target's limit.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;
if (DAG->isTrackingPressure()) {
// Avoid exceeding the target's limit.
if (tryPressure(TryCand.RPDelta.Excess, Cand.RPDelta.Excess, TryCand, Cand,
RegExcess, TRI, DAG->MF))
return TryCand.Reason != NoCand;

// Avoid increasing the max critical pressure in the scheduled region.
if (DAG->isTrackingPressure() &&
tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
// Avoid increasing the max critical pressure in the scheduled region.
if (tryPressure(TryCand.RPDelta.CriticalMax, Cand.RPDelta.CriticalMax,
TryCand, Cand, RegCritical, TRI, DAG->MF))
return TryCand.Reason != NoCand;
}

// MaxMemoryClause-specific: We prioritize clustered instructions as we would
// get more benefit from clausing these memory instructions.
Expand Down Expand Up @@ -737,8 +737,8 @@ bool GCNMaxMemoryClauseSchedStrategy::tryCandidate(SchedCandidate &Cand,
return TryCand.Reason != NoCand;

// Fall through to original instruction order.
if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum) ||
(!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
if ((Zone->isTop() == (TryCand.SU->NodeNum < Cand.SU->NodeNum))) {
assert(TryCand.SU->NodeNum != Cand.SU->NodeNum);
TryCand.Reason = NodeOrder;
return true;
}
Expand Down
224 changes: 97 additions & 127 deletions llvm/test/CodeGen/AMDGPU/group-image-instructions.ll
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s
; RUN: llc -mtriple=amdgcn-amd-amdpa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s

define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 inreg noundef %userdata6, i32 inreg noundef %userdata7, i32 inreg noundef %userdata8, i32 inreg noundef %PrimMask, <2 x float> noundef %PerspInterpSample, <2 x float> noundef %PerspInterpCenter, <2 x float> noundef %PerspInterpCentroid) #2 {
; GFX11-LABEL: group_image_sample:
; GFX11: ; %bb.0: ; %.entry
; GFX11-NEXT: s_mov_b64 s[16:17], exec
; GFX11-NEXT: s_wqm_b64 exec, exec
; GFX11-NEXT: s_mov_b32 s16, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: s_mov_b32 m0, s4
; GFX11-NEXT: s_getpc_b64 s[4:5]
; GFX11-NEXT: s_mov_b32 s0, s1
Expand All @@ -16,11 +16,11 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: s_load_b128 s[12:15], s[0:1], 0x0
; GFX11-NEXT: s_load_b128 s[8:11], s[2:3], 0x0
; GFX11-NEXT: s_load_b256 s[0:7], s[6:7], 0x0
; GFX11-NEXT: s_mov_b64 s[18:19], exec
; GFX11-NEXT: s_wqm_b64 exec, exec
; GFX11-NEXT: s_mov_b32 s17, exec_lo
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
; GFX11-NEXT: lds_param_load v2, attr0.y wait_vdst:15
; GFX11-NEXT: lds_param_load v3, attr0.x wait_vdst:15
; GFX11-NEXT: s_mov_b64 exec, s[18:19]
; GFX11-NEXT: s_mov_b32 exec_lo, s17
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x10
Expand All @@ -30,21 +30,17 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: v_interp_p10_f32 v4, v2, v0, v2 wait_exp:1
; GFX11-NEXT: v_interp_p10_f32 v0, v3, v0, v3 wait_exp:0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_interp_p2_f32 v45, v2, v1, v4 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v44, v3, v1, v0 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v61, v2, v1, v4 wait_exp:7
; GFX11-NEXT: v_interp_p2_f32 v60, v3, v1, v0 wait_exp:7
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
; GFX11-NEXT: v_add_f32_e32 v8, s20, v44
; GFX11-NEXT: v_add_f32_e32 v9, s21, v45
; GFX11-NEXT: v_add_f32_e32 v16, s24, v44
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
; GFX11-NEXT: v_dual_add_f32 v8, s20, v60 :: v_dual_add_f32 v9, s21, v61
; GFX11-NEXT: v_dual_add_f32 v16, s24, v60 :: v_dual_add_f32 v17, s25, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[8:9], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
; GFX11-NEXT: v_add_f32_e32 v17, s25, v45
; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[12:15], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[16:19], v[16:17], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
Expand All @@ -54,135 +50,109 @@ define amdgpu_ps void @group_image_sample(i32 inreg noundef %globalTable, i32 in
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0x70
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0x80
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, s18, v44
; GFX11-NEXT: v_add_f32_e32 v1, s19, v45
; GFX11-NEXT: v_add_f32_e32 v24, s20, v44
; GFX11-NEXT: v_add_f32_e32 v25, s21, v45
; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
; GFX11-NEXT: v_dual_add_f32 v24, s20, v60 :: v_dual_add_f32 v25, s21, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[20:23], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[24:27], v[24:25], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x7
; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
; GFX11-NEXT: v_dual_add_f32 v32, s24, v60 :: v_dual_add_f32 v33, s25, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[28:31], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[32:35], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0x90
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xa0
; GFX11-NEXT: s_buffer_load_b64 s[26:27], s[12:15], 0xb0
; GFX11-NEXT: s_buffer_load_b64 s[28:29], s[12:15], 0xc0
; GFX11-NEXT: s_buffer_load_b64 s[30:31], s[12:15], 0xd0
; GFX11-NEXT: s_buffer_load_b64 s[34:35], s[12:15], 0xe0
; GFX11-NEXT: s_buffer_load_b64 s[36:37], s[12:15], 0xf0
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
; GFX11-NEXT: v_add_f32_e32 v0, s22, v44
; GFX11-NEXT: v_add_f32_e32 v1, s23, v45
; GFX11-NEXT: v_add_f32_e32 v28, s24, v44
; GFX11-NEXT: v_add_f32_e32 v29, s25, v45
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xb0
; GFX11-NEXT: s_buffer_load_b64 s[24:25], s[12:15], 0xc0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v30, s18, v44
; GFX11-NEXT: v_add_f32_e32 v31, s19, v45
; GFX11-NEXT: v_add_f32_e32 v32, s20, v44
; GFX11-NEXT: v_add_f32_e32 v33, s21, v45
; GFX11-NEXT: v_add_f32_e32 v34, s26, v44
; GFX11-NEXT: v_add_f32_e32 v35, s27, v45
; GFX11-NEXT: v_add_f32_e32 v36, s28, v44
; GFX11-NEXT: v_add_f32_e32 v37, s29, v45
; GFX11-NEXT: v_add_f32_e32 v38, s30, v44
; GFX11-NEXT: v_add_f32_e32 v39, s31, v45
; GFX11-NEXT: v_add_f32_e32 v40, s34, v44
; GFX11-NEXT: v_add_f32_e32 v41, s35, v45
; GFX11-NEXT: v_add_f32_e32 v42, s36, v44
; GFX11-NEXT: v_add_f32_e32 v43, s37, v45
; GFX11-NEXT: v_add_f32_e32 v44, s12, v44
; GFX11-NEXT: v_add_f32_e32 v45, s13, v45
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_f32_e32 v46, v8, v4
; GFX11-NEXT: v_add_f32_e32 v47, v9, v5
; GFX11-NEXT: v_add_f32_e32 v48, v10, v6
; GFX11-NEXT: v_add_f32_e32 v49, v11, v7
; GFX11-NEXT: s_and_b64 exec, exec, s[16:17]
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[4:7], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[8:11], v[28:29], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_add_f32_e32 v0, v12, v46
; GFX11-NEXT: v_add_f32_e32 v1, v13, v47
; GFX11-NEXT: v_add_f32_e32 v46, v14, v48
; GFX11-NEXT: v_add_f32_e32 v47, v15, v49
; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
; GFX11-NEXT: v_dual_add_f32 v40, s20, v60 :: v_dual_add_f32 v41, s21, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[12:15], v[30:31], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[28:31], v[32:33], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
; GFX11-NEXT: v_add_f32_e32 v46, v18, v46
; GFX11-NEXT: v_add_f32_e32 v47, v19, v47
; GFX11-NEXT: image_sample v[36:39], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[40:43], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
; GFX11-NEXT: v_dual_add_f32 v48, s24, v60 :: v_dual_add_f32 v49, s25, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[16:19], v[34:35], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[32:35], v[36:37], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
; GFX11-NEXT: v_add_f32_e32 v46, v22, v46
; GFX11-NEXT: v_add_f32_e32 v47, v23, v47
; GFX11-NEXT: image_sample v[44:47], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[48:51], v[48:49], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_clause 0x3
; GFX11-NEXT: s_buffer_load_b64 s[18:19], s[12:15], 0xd0
; GFX11-NEXT: s_buffer_load_b64 s[20:21], s[12:15], 0xe0
; GFX11-NEXT: s_buffer_load_b64 s[22:23], s[12:15], 0xf0
; GFX11-NEXT: s_buffer_load_b64 s[12:13], s[12:15], 0x100
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_add_f32 v0, s18, v60 :: v_dual_add_f32 v1, s19, v61
; GFX11-NEXT: v_dual_add_f32 v56, s20, v60 :: v_dual_add_f32 v57, s21, v61
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[20:23], v[38:39], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[36:39], v[40:41], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
; GFX11-NEXT: v_add_f32_e32 v46, v26, v46
; GFX11-NEXT: v_add_f32_e32 v47, v27, v47
; GFX11-NEXT: image_sample v[52:55], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[56:59], v[56:57], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: v_dual_add_f32 v0, s22, v60 :: v_dual_add_f32 v1, s23, v61
; GFX11-NEXT: v_dual_add_f32 v64, s12, v60 :: v_dual_add_f32 v65, s13, v61
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s16
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: image_sample v[24:27], v[42:43], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[40:43], v[44:45], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[60:63], v[0:1], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: image_sample v[64:67], v[64:65], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
; GFX11-NEXT: s_waitcnt vmcnt(14)
; GFX11-NEXT: v_dual_add_f32 v0, v8, v4 :: v_dual_add_f32 v1, v9, v5
; GFX11-NEXT: v_dual_add_f32 v4, v10, v6 :: v_dual_add_f32 v5, v11, v7
; GFX11-NEXT: s_waitcnt vmcnt(13)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v12, v0 :: v_dual_add_f32 v1, v13, v1
; GFX11-NEXT: v_dual_add_f32 v4, v14, v4 :: v_dual_add_f32 v5, v15, v5
; GFX11-NEXT: s_waitcnt vmcnt(12)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v16, v0 :: v_dual_add_f32 v1, v17, v1
; GFX11-NEXT: v_dual_add_f32 v4, v18, v4 :: v_dual_add_f32 v5, v19, v5
; GFX11-NEXT: s_waitcnt vmcnt(11)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v20, v0 :: v_dual_add_f32 v1, v21, v1
; GFX11-NEXT: v_dual_add_f32 v4, v22, v4 :: v_dual_add_f32 v5, v23, v5
; GFX11-NEXT: s_waitcnt vmcnt(10)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v24, v0 :: v_dual_add_f32 v1, v25, v1
; GFX11-NEXT: v_dual_add_f32 v4, v26, v4 :: v_dual_add_f32 v5, v27, v5
; GFX11-NEXT: s_waitcnt vmcnt(9)
; GFX11-NEXT: v_add_f32_e32 v0, v4, v0
; GFX11-NEXT: v_add_f32_e32 v1, v5, v1
; GFX11-NEXT: v_add_f32_e32 v4, v6, v46
; GFX11-NEXT: v_add_f32_e32 v5, v7, v47
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v28, v0 :: v_dual_add_f32 v1, v29, v1
; GFX11-NEXT: v_dual_add_f32 v4, v30, v4 :: v_dual_add_f32 v5, v31, v5
; GFX11-NEXT: s_waitcnt vmcnt(8)
; GFX11-NEXT: v_add_f32_e32 v0, v8, v0
; GFX11-NEXT: v_add_f32_e32 v1, v9, v1
; GFX11-NEXT: v_add_f32_e32 v4, v10, v4
; GFX11-NEXT: v_add_f32_e32 v5, v11, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v32, v0 :: v_dual_add_f32 v1, v33, v1
; GFX11-NEXT: v_dual_add_f32 v4, v34, v4 :: v_dual_add_f32 v5, v35, v5
; GFX11-NEXT: s_waitcnt vmcnt(7)
; GFX11-NEXT: v_add_f32_e32 v0, v12, v0
; GFX11-NEXT: v_add_f32_e32 v1, v13, v1
; GFX11-NEXT: v_add_f32_e32 v4, v14, v4
; GFX11-NEXT: v_add_f32_e32 v5, v15, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v36, v0 :: v_dual_add_f32 v1, v37, v1
; GFX11-NEXT: v_dual_add_f32 v4, v38, v4 :: v_dual_add_f32 v5, v39, v5
; GFX11-NEXT: s_waitcnt vmcnt(6)
; GFX11-NEXT: v_add_f32_e32 v0, v28, v0
; GFX11-NEXT: v_add_f32_e32 v1, v29, v1
; GFX11-NEXT: v_add_f32_e32 v4, v30, v4
; GFX11-NEXT: v_add_f32_e32 v5, v31, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v40, v0 :: v_dual_add_f32 v1, v41, v1
; GFX11-NEXT: v_dual_add_f32 v4, v42, v4 :: v_dual_add_f32 v5, v43, v5
; GFX11-NEXT: s_waitcnt vmcnt(5)
; GFX11-NEXT: v_add_f32_e32 v0, v16, v0
; GFX11-NEXT: v_add_f32_e32 v1, v17, v1
; GFX11-NEXT: v_add_f32_e32 v4, v18, v4
; GFX11-NEXT: v_add_f32_e32 v5, v19, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v44, v0 :: v_dual_add_f32 v1, v45, v1
; GFX11-NEXT: v_dual_add_f32 v4, v46, v4 :: v_dual_add_f32 v5, v47, v5
; GFX11-NEXT: s_waitcnt vmcnt(4)
; GFX11-NEXT: v_add_f32_e32 v0, v32, v0
; GFX11-NEXT: v_add_f32_e32 v1, v33, v1
; GFX11-NEXT: v_add_f32_e32 v4, v34, v4
; GFX11-NEXT: v_add_f32_e32 v5, v35, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v48, v0 :: v_dual_add_f32 v1, v49, v1
; GFX11-NEXT: v_dual_add_f32 v4, v50, v4 :: v_dual_add_f32 v5, v51, v5
; GFX11-NEXT: s_waitcnt vmcnt(3)
; GFX11-NEXT: v_add_f32_e32 v0, v20, v0
; GFX11-NEXT: v_add_f32_e32 v1, v21, v1
; GFX11-NEXT: v_add_f32_e32 v4, v22, v4
; GFX11-NEXT: v_add_f32_e32 v5, v23, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v52, v0 :: v_dual_add_f32 v1, v53, v1
; GFX11-NEXT: v_dual_add_f32 v4, v54, v4 :: v_dual_add_f32 v5, v55, v5
; GFX11-NEXT: s_waitcnt vmcnt(2)
; GFX11-NEXT: v_add_f32_e32 v0, v36, v0
; GFX11-NEXT: v_add_f32_e32 v1, v37, v1
; GFX11-NEXT: v_add_f32_e32 v4, v38, v4
; GFX11-NEXT: v_add_f32_e32 v5, v39, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v56, v0 :: v_dual_add_f32 v1, v57, v1
; GFX11-NEXT: v_dual_add_f32 v4, v58, v4 :: v_dual_add_f32 v5, v59, v5
; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_add_f32_e32 v0, v24, v0
; GFX11-NEXT: v_add_f32_e32 v1, v25, v1
; GFX11-NEXT: v_add_f32_e32 v4, v26, v4
; GFX11-NEXT: v_add_f32_e32 v5, v27, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v60, v0 :: v_dual_add_f32 v1, v61, v1
; GFX11-NEXT: v_dual_add_f32 v4, v62, v4 :: v_dual_add_f32 v5, v63, v5
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_add_f32_e32 v0, v40, v0
; GFX11-NEXT: v_add_f32_e32 v1, v41, v1
; GFX11-NEXT: v_add_f32_e32 v4, v42, v4
; GFX11-NEXT: v_add_f32_e32 v5, v43, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_dual_add_f32 v0, v64, v0 :: v_dual_add_f32 v1, v65, v1
; GFX11-NEXT: v_dual_add_f32 v4, v66, v4 :: v_dual_add_f32 v5, v67, v5
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v0, v0, v1
; GFX11-NEXT: v_cvt_pk_rtz_f16_f32_e32 v1, v4, v5
; GFX11-NEXT: exp mrt0 v0, v1, off, off done
Expand Down Expand Up @@ -479,7 +449,7 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #3
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #3
declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32 immarg) #8

attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause" "InitialPSInputAddr"="2" "amdgpu-color-export"="1" "amdgpu-depth-export"="0" "amdgpu-memory-bound"="false" "amdgpu-prealloc-sgpr-spill-vgprs" "amdgpu-unroll-threshold"="700" "amdgpu-wave-limiter"="false" "denormal-fp-math-f32"="preserve-sign" "target-features"=",+wavefrontsize64,-cumode" }
attributes #2 = { alwaysinline nounwind memory(readwrite) "amdgpu-sched-strategy"="max-memory-clause"}
attributes #3 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #4 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
attributes #5 = { nocallback nofree nosync nounwind willreturn memory(read) }
Expand Down
Loading