Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1829,8 +1829,8 @@ void AArch64AsmPrinter::emitMOVK(Register Dest, uint64_t Imm, unsigned Shift) {

void AArch64AsmPrinter::emitFMov0(const MachineInstr &MI) {
Register DestReg = MI.getOperand(0).getReg();
if (STI->hasZeroCycleZeroingFP() && !STI->hasZeroCycleZeroingFPWorkaround() &&
STI->isNeonAvailable()) {
if (STI->hasZeroCycleZeroingFPR64() &&
!STI->hasZeroCycleZeroingFPWorkaround() && STI->isNeonAvailable()) {
// Convert H/S register to corresponding D register
if (AArch64::H0 <= DestReg && DestReg <= AArch64::H31)
DestReg = AArch64::D0 + (DestReg - AArch64::H0);
Expand Down
15 changes: 7 additions & 8 deletions llvm/lib/Target/AArch64/AArch64Features.td
Original file line number Diff line number Diff line change
Expand Up @@ -630,19 +630,18 @@ def FeatureZCRegMoveFPR64 : SubtargetFeature<"zcm-fpr64", "HasZeroCycleRegMoveFP
def FeatureZCRegMoveFPR32 : SubtargetFeature<"zcm-fpr32", "HasZeroCycleRegMoveFPR32", "true",
"Has zero-cycle register moves for FPR32 registers">;

def FeatureZCZeroingGP : SubtargetFeature<"zcz-gp", "HasZeroCycleZeroingGP", "true",
"Has zero-cycle zeroing instructions for generic registers">;
def FeatureZCZeroingGPR64 : SubtargetFeature<"zcz-gpr64", "HasZeroCycleZeroingGPR64", "true",
"Has zero-cycle zeroing instructions for GPR64 registers">;

def FeatureZCZeroingGPR32 : SubtargetFeature<"zcz-gpr32", "HasZeroCycleZeroingGPR32", "true",
"Has zero-cycle zeroing instructions for GPR32 registers">;

// It is generally beneficial to rewrite "fmov s0, wzr" to "movi d0, #0",
// as movi is more efficient across all cores. Newer cores can eliminate
// fmovs early and there is no difference with movi, but this is not true for
// all implementations.
def FeatureNoZCZeroingFP : SubtargetFeature<"no-zcz-fp", "HasZeroCycleZeroingFP", "false",
"Has no zero-cycle zeroing instructions for FP registers">;

def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
"Has zero-cycle zeroing instructions",
[FeatureZCZeroingGP]>;
def FeatureNoZCZeroingFPR64 : SubtargetFeature<"no-zcz-fpr64", "HasZeroCycleZeroingFPR64", "false",
"Has no zero-cycle zeroing instructions for FPR64 registers">;

/// ... but the floating-point version doesn't quite work in rare cases on older
/// CPUs.
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5075,7 +5075,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
}
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGP()) {
} else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroingGPR32()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
Expand Down Expand Up @@ -5202,7 +5202,7 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGP()) {
} else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroingGPR64()) {
BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg)
.addImm(0)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
Expand Down
43 changes: 28 additions & 15 deletions llvm/lib/Target/AArch64/AArch64Processors.td
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,8 @@ def TuneAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureZCZeroingFPWorkaround]>;

def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
Expand All @@ -336,7 +337,8 @@ def TuneAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
"Apple A11", [
Expand All @@ -349,7 +351,8 @@ def TuneAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
"Apple A12", [
Expand All @@ -362,7 +365,8 @@ def TuneAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
"Apple A13", [
Expand All @@ -375,7 +379,8 @@ def TuneAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
"Apple A14", [
Expand All @@ -393,7 +398,8 @@ def TuneAppleA14 : SubtargetFeature<"apple-a14", "ARMProcFamily", "AppleA14",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
"Apple A15", [
Expand All @@ -411,7 +417,8 @@ def TuneAppleA15 : SubtargetFeature<"apple-a15", "ARMProcFamily", "AppleA15",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
"Apple A16", [
Expand All @@ -429,7 +436,8 @@ def TuneAppleA16 : SubtargetFeature<"apple-a16", "ARMProcFamily", "AppleA16",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
"Apple A17", [
Expand All @@ -447,7 +455,8 @@ def TuneAppleA17 : SubtargetFeature<"apple-a17", "ARMProcFamily", "AppleA17",
FeatureStorePairSuppress,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
"Apple M4", [
Expand All @@ -464,8 +473,8 @@ def TuneAppleM4 : SubtargetFeature<"apple-m4", "ARMProcFamily", "AppleM4",
FeatureFuseLiterals,
FeatureZCRegMoveGPR64,
FeatureZCRegMoveFPR128,
FeatureZCZeroing
]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
Expand Down Expand Up @@ -497,21 +506,24 @@ def TuneExynosM4 : SubtargetFeature<"exynosm4", "ARMProcFamily", "ExynosM3",
FeatureStorePairSuppress,
FeatureALULSLFast,
FeaturePostRAScheduler,
FeatureZCZeroing]>;
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64]>;

def TuneKryo : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
"Qualcomm Kryo processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureALULSLFast,
FeatureStorePairSuppress]>;

def TuneFalkor : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
"Qualcomm Falkor processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast,
FeatureSlowSTRQro]>;
Expand Down Expand Up @@ -607,7 +619,8 @@ def TuneSaphira : SubtargetFeature<"saphira", "ARMProcFamily", "Saphira",
"Qualcomm Saphira processors", [
FeaturePostRAScheduler,
FeaturePredictableSelectIsExpensive,
FeatureZCZeroing,
FeatureZCZeroingGPR32,
FeatureZCZeroingGPR64,
FeatureStorePairSuppress,
FeatureALULSLFast]>;

Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AArch64/arm64-copy-phys-zero-reg.mir
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,-zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-NO-ZCM-GPR64-NO-ZCZ %s
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,-zcz-gpr32,-zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-ZCM-GPR32-ZCM-GPR64-NO-ZCZ %s
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="-zcm-gpr32,-zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-NO-ZCM-ZCZ %s
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz" %s \
# RUN: llc -o - -mtriple=arm64-apple-ios -run-pass=postrapseudos -simplify-mir -verify-machineinstrs -mattr="+zcm-gpr32,+zcm-gpr64,+zcz-gpr32,+zcz-gpr64" %s \
# RUN: | FileCheck --check-prefix=CHECK-ZCM-ZCZ %s

--- |
Expand Down
153 changes: 153 additions & 0 deletions llvm/test/CodeGen/AArch64/arm64-zero-cycle-zeroing-fpr.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+no-zcz-fpr64,+fullfp16 | FileCheck %s -check-prefixes=ALL,NOZCZ-FPR64-FULLFP16
; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+fullfp16 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=arm64-apple-ios -mcpu=cyclone | FileCheck %s -check-prefixes=ALL,FP-WORKAROUND
; RUN: llc < %s -mtriple=arm64-apple-macosx -mcpu=apple-m1 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m3 | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64
; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor | FileCheck %s -check-prefixes=ALL,ZCZ-FPR64

; Returns the half constant 0.0 so the zeroing idiom chosen for a 16-bit FP
; value can be inspected. The expectations below match "mov s0, wzr" for the
; cyclone run and the +no-zcz-fpr64 run, "mov h0, wzr" when +fullfp16 is added
; to +no-zcz-fpr64, and "movi d0, #0" for every remaining run.
define half @tf16() {
entry:
; ALL-LABEL: tf16:
; FP-WORKAROUND: mov s0, wzr
; NOZCZ-FPR64: mov s0, wzr
; NOZCZ-FPR64-FULLFP16: mov h0, wzr
; ZCZ-FPR64: movi d0, #0
ret half 0.0
}

; Returns the float constant 0.0. The expectations below match "mov s0, wzr"
; for the cyclone run and the +no-zcz-fpr64 run, and "movi d0, #0" for the
; runs that use the zero-cycle-zeroing prefix.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run has no instruction expectation
; inside this function; only the label is checked for it.
define float @tf32() {
entry:
; ALL-LABEL: tf32:
; FP-WORKAROUND: mov s0, wzr
; NOZCZ-FPR64: mov s0, wzr
; ZCZ-FPR64: movi d0, #0
ret float 0.0
}

; Returns the double constant 0.0. The expectations below match "mov d0, xzr"
; for the cyclone run and the +no-zcz-fpr64 run, and "movi d0, #0" for the
; runs that use the zero-cycle-zeroing prefix.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run has no instruction expectation
; inside this function; only the label is checked for it.
define double @td64() {
entry:
; ALL-LABEL: td64:
; FP-WORKAROUND: mov d0, xzr
; NOZCZ-FPR64: mov d0, xzr
; ZCZ-FPR64: movi d0, #0
ret double 0.0
}

; Returns an all-zero <8 x i8> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <8 x i8> @tv8i8() {
entry:
; ALL-LABEL: tv8i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}

; Returns an all-zero <4 x i16> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <4 x i16> @tv4i16() {
entry:
; ALL-LABEL: tv4i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i16> <i16 0, i16 0, i16 0, i16 0>
}

; Returns an all-zero <2 x i32> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <2 x i32> @tv2i32() {
entry:
; ALL-LABEL: tv2i32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x i32> <i32 0, i32 0>
}

; Returns an all-zero <2 x float> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <2 x float> @tv2f32() {
entry:
; ALL-LABEL: tv2f32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x float> <float 0.0, float 0.0>
}

; Returns an all-zero <16 x i8> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <16 x i8> @tv16i8() {
entry:
; ALL-LABEL: tv16i8:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
}

; Returns an all-zero <8 x i16> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <8 x i16> @tv8i16() {
entry:
; ALL-LABEL: tv8i16:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>
}

; Returns an all-zero <4 x i32> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <4 x i32> @tv4i32() {
entry:
; ALL-LABEL: tv4i32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x i32> <i32 0, i32 0, i32 0, i32 0>
}

; Returns an all-zero <2 x i64> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <2 x i64> @tv2i64() {
entry:
; ALL-LABEL: tv2i64:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x i64> <i64 0, i64 0>
}

; Returns an all-zero <4 x float> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <4 x float> @tv4f32() {
entry:
; ALL-LABEL: tv4f32:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <4 x float> <float 0.0, float 0.0, float 0.0, float 0.0>
}

; Returns an all-zero <2 x double> vector. Every run with an instruction
; expectation here wants a movi of v0 with #0; the cyclone pattern optionally
; allows a .16b arrangement suffix, the other patterns optionally allow .2d.
; NOTE(review): the +no-zcz-fpr64,+fullfp16 run only has its label checked.
define <2 x double> @tv2d64() {
entry:
; ALL-LABEL: tv2d64:
; FP-WORKAROUND: movi{{(.16b)?}} v0{{(.16b)?}}, #0
; NOZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
; ZCZ-FPR64: movi{{(.2d)?}} v0{{(.2d)?}}, #0
ret <2 x double> <double 0.0, double 0.0>
}

; We used to produce spills+reloads for a Q register with zero cycle zeroing
; enabled.
; The two negative checks below apply to every run, since all of them share
; the common prefix: no "str q<N>" and no "ldr q<N>" may appear after the foo
; label.
; ALL-LABEL: foo:
; ALL-NOT: str q{{[0-9]+}}
; ALL-NOT: ldr q{{[0-9]+}}
; The function keeps a double value live across a call to @sin inside a loop.
define double @foo(i32 %n) {
entry:
br label %for.body

; Loop body: %phi0 carries the running value (initially 1.0) and %i.076 is the
; induction variable (initially 0).
for.body:
%phi0 = phi double [ 1.0, %entry ], [ %v0, %for.body ]
%i.076 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%conv21 = sitofp i32 %i.076 to double
%call = tail call fast double @sin(double %conv21)
; Keep the larger of the running value and sin(i).
%cmp.i = fcmp fast olt double %phi0, %call
%v0 = select i1 %cmp.i, double %call, double %phi0
%inc = add nuw nsw i32 %i.076, 1
; Continue while the incremented counter is still below %n.
%cmp = icmp slt i32 %inc, %n
br i1 %cmp, label %for.body, label %for.end

for.end:
ret double %v0
}

declare double @sin(double)
Loading
Loading