diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 73791102fc04d..5b0a87d5f150f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -849,7 +849,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
     AddPromotedToType(Op, MVT::bf16, MVT::f32);
   }
   for (const auto &Op : {ISD::FABS}) {
-    setOperationAction(Op, MVT::f16, Promote);
+    // Expand instead of Promote to clear the sign bit by bitcasting to i16.
+    setOperationAction(Op, MVT::f16, Expand);
     setOperationAction(Op, MVT::f32, Legal);
     setOperationAction(Op, MVT::f64, Legal);
     setOperationAction(Op, MVT::v2f16, Expand);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 1a6be4eb5af1e..2e70a210b3335 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -601,7 +601,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   auto setF16Action = [&] (MVT VT, LegalizeAction Action) {
     setOperationAction(ISD::FABS, VT, Action);
     setOperationAction(ISD::FNEG, VT, Action);
-    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
     setOperationAction(ISD::FREM, VT, Action);
     setOperationAction(ISD::FMA, VT, Action);
     setOperationAction(ISD::FMINNUM, VT, Action);
@@ -672,6 +671,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // Half type will be promoted by default.
     setF16Action(MVT::f16, Promote);
+    // Expand instead of Promote to clear/flip/copy the sign bit by
+    // bitcasting to i16.
+    setOperationAction(ISD::FABS, MVT::f16, Expand);
+    setOperationAction(ISD::FNEG, MVT::f16, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
     setOperationAction(ISD::FADD, MVT::f16, Promote);
     setOperationAction(ISD::FSUB, MVT::f16, Promote);
     setOperationAction(ISD::FMUL, MVT::f16, Promote);
diff --git a/llvm/test/CodeGen/NVPTX/f16-instructions.ll b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
index 14e02a49f6e5e..cfabf1d6639c2 100644
--- a/llvm/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16-instructions.ll
@@ -981,14 +981,10 @@ define half @test_fma(half %a, half %b, half %c) #0 {
 }
 
 ; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
-; CHECK-NOFTZ: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOFTZ: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK-F16-FTZ: cvt.ftz.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-F16-FTZ: abs.ftz.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%rs[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b16 [[A:%rs[0-9]+]], [test_fabs_param_0];
+; CHECK: and.b16 [[RF:%rs[0-9]+]], [[A]], 32767;
+; CHECK: st.param.b16 [func_retval0+0], [[RF]];
+; CHECK: ret;
 define half @test_fabs(half %a) #0 {
   %r = call half @llvm.fabs.f16(half %a)
   ret half %r
diff --git a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
index 464b3a754804f..0277f2d7de459 100644
--- a/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1182,18 +1182,15 @@ define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
   ret <2 x half> %r
 }
 
+; TODO: This should be optimised to directly use AND on the i32 register.
 ; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%rs[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%rs[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%r[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
+; CHECK: ld.param.b32 [[A:%r[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%rs[0-9]+]], [[A1:%rs[0-9]+]]}, [[A]]
+; CHECK: and.b16 [[A2:%rs[0-9]+]], [[A1]], 32767;
+; CHECK: and.b16 [[A3:%rs[0-9]+]], [[A0]], 32767;
+; CHECK: mov.b32 [[B:%r[0-9]+]], {[[A3]], [[A2]]};
+; CHECK: st.param.b32 [func_retval0+0], [[B]];
+; CHECK: ret;
 define <2 x half> @test_fabs(<2 x half> %a) #0 {
   %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
   ret <2 x half> %r
diff --git a/llvm/test/CodeGen/X86/fp16-libcalls.ll b/llvm/test/CodeGen/X86/fp16-libcalls.ll
index 933971212f11d..f6216b7922987 100644
--- a/llvm/test/CodeGen/X86/fp16-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp16-libcalls.ll
@@ -350,11 +350,7 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fabs:
 ; F16C: # %bb.0:
 ; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: andl $32767, %eax # imm = 0x7FFF
 ; F16C-NEXT: movw %ax, (%rdi)
 ; F16C-NEXT: retq
 ;
@@ -367,34 +363,17 @@ define void @test_half_fabs(half %a0, ptr %p0) nounwind {
 ;
 ; X64-LABEL: test_half_fabs:
 ; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2@PLT
-; X64-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2@PLT
 ; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: andl $32767, %eax # imm = 0x7FFF
+; X64-NEXT: movw %ax, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test_half_fabs:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: andl $32767, %ecx # imm = 0x7FFF
+; X86-NEXT: movw %cx, (%eax)
 ; X86-NEXT: retl
   %res = call half @llvm.fabs.half(half %a0)
   store half %res, ptr %p0, align 2
@@ -555,11 +534,7 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ; F16C-LABEL: test_half_fneg:
 ; F16C: # %bb.0:
 ; F16C-NEXT: vpextrw $0, %xmm0, %eax
-; F16C-NEXT: vmovd %eax, %xmm0
-; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0
-; F16C-NEXT: vmovd %xmm0, %eax
+; F16C-NEXT: xorl $32768, %eax # imm = 0x8000
 ; F16C-NEXT: movw %ax, (%rdi)
 ; F16C-NEXT: retq
 ;
@@ -572,34 +547,17 @@ define void @test_half_fneg(half %a0, ptr %p0) nounwind {
 ;
 ; X64-LABEL: test_half_fneg:
 ; X64: # %bb.0:
-; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %rdi, %rbx
-; X64-NEXT: callq __extendhfsf2@PLT
-; X64-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-NEXT: callq __truncsfhf2@PLT
 ; X64-NEXT: pextrw $0, %xmm0, %eax
-; X64-NEXT: movw %ax, (%rbx)
-; X64-NEXT: popq %rbx
+; X64-NEXT: xorl $32768, %eax # imm = 0x8000
+; X64-NEXT: movw %ax, (%rdi)
 ; X64-NEXT: retq
 ;
 ; X86-LABEL: test_half_fneg:
 ; X86: # %bb.0:
-; X86-NEXT: pushl %esi
-; X86-NEXT: subl $8, %esp
-; X86-NEXT: pinsrw $0, {{[0-9]+}}(%esp), %xmm0
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esp)
-; X86-NEXT: calll __extendhfsf2
-; X86-NEXT: fstps {{[0-9]+}}(%esp)
-; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-NEXT: movd %xmm0, (%esp)
-; X86-NEXT: calll __truncsfhf2
-; X86-NEXT: pextrw $0, %xmm0, %eax
-; X86-NEXT: movw %ax, (%esi)
-; X86-NEXT: addl $8, %esp
-; X86-NEXT: popl %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl $32768, %ecx # imm = 0x8000
+; X86-NEXT: xorl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movw %cx, (%eax)
 ; X86-NEXT: retl
   %res = fneg half %a0
   store half %res, ptr %p0, align 2
diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll
index 9f01d07e6a670..6838925240058 100644
--- a/llvm/test/CodeGen/X86/half.ll
+++ b/llvm/test/CodeGen/X86/half.ll
@@ -1059,7 +1059,6 @@ define void @main.158() #0 {
 ; CHECK-LIBCALL: # %bb.0: # %entry
 ; CHECK-LIBCALL-NEXT: pushq %rax
 ; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0
-; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT
 ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT
 ; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1
@@ -1077,10 +1076,10 @@ define void @main.158() #0 {
 ; BWON-F16C-LABEL: main.158:
 ; BWON-F16C: # %bb.0: # %entry
 ; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1
-; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1
-; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
-; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2
+; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0
+; BWON-F16C-NEXT: vmovss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; BWON-F16C-NEXT: vucomiss %xmm0, %xmm1
+; BWON-F16C-NEXT: vxorps %xmm0, %xmm0, %xmm0
 ; BWON-F16C-NEXT: jae .LBB20_2
 ; BWON-F16C-NEXT: # %bb.1: # %entry
 ; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
@@ -1093,8 +1092,7 @@ define void @main.158() #0 {
 ; CHECK-I686-LABEL: main.158:
 ; CHECK-I686: # %bb.0: # %entry
 ; CHECK-I686-NEXT: subl $12, %esp
-; CHECK-I686-NEXT: movl $0, (%esp)
-; CHECK-I686-NEXT: calll __truncsfhf2
+; CHECK-I686-NEXT: pxor %xmm0, %xmm0
 ; CHECK-I686-NEXT: pextrw $0, %xmm0, %eax
 ; CHECK-I686-NEXT: movw %ax, (%esp)
 ; CHECK-I686-NEXT: calll __extendhfsf2
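
Note: with Expand, SelectionDAG legalizes f16 fabs/fneg/fcopysign as plain sign-bit arithmetic on the value bitcast to i16, instead of promoting to f32 and back, which previously cost cvt instructions on NVPTX and __extendhfsf2/__truncsfhf2 libcalls on x86 targets without native f16 support. Since only bit 15 changes, the integer form is exact for every input, including NaN and infinity, and sidesteps the float round trip entirely. A minimal standalone C++ sketch of the equivalent bit manipulation that the updated CHECK lines verify (helper names are illustrative, not LLVM APIs):

  #include <cstdint>
  #include <cstdio>

  // "Bits" stands in for the raw IEEE-754 binary16 encoding (bitcast f16 <-> i16).
  using Bits = std::uint16_t;

  constexpr Bits SignMask = 0x8000; // bit 15: the sign bit
  constexpr Bits AbsMask  = 0x7FFF; // everything except the sign bit

  // fabs: clear the sign bit -> NVPTX "and.b16 ..., 32767" / x86 "andl $32767".
  Bits fabsF16(Bits X) { return X & AbsMask; }
  // fneg: flip the sign bit -> x86 "xorl $32768".
  Bits fnegF16(Bits X) { return X ^ SignMask; }
  // copysign: keep Mag's magnitude, take Sgn's sign bit.
  Bits copysignF16(Bits Mag, Bits Sgn) {
    return (Mag & AbsMask) | (Sgn & SignMask);
  }

  int main() {
    const Bits NegOne = 0xBC00; // -1.0 in binary16; +1.0 is 0x3C00
    std::printf("fabs     -> 0x%04X\n", unsigned(fabsF16(NegOne)));            // 0x3C00
    std::printf("fneg     -> 0x%04X\n", unsigned(fnegF16(NegOne)));            // 0x3C00
    std::printf("copysign -> 0x%04X\n", unsigned(copysignF16(0x3C00, NegOne))); // 0xBC00
  }

For example, fabs on -1.0 (0xBC00) clears bit 15 to give +1.0 (0x3C00), matching the and.b16/andl $32767 patterns above, and fneg's xorl $32768 flips that same bit.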