Skip to content

Performance regression with auto-vectorization from 1.87 onwards #145123

@okaneco

Description

@okaneco

Code

I tried this code:

/// Unfilters `current` in place, four bytes at a time, using `previous` as the reference row.
///
/// Both slices are walked in lock-step as exact 4-byte chunks. Iteration stops at the
/// shorter of the two, and trailing bytes that do not fill a whole chunk are left
/// untouched. Each byte of `current` is replaced by itself plus (wrapping) the value
/// `filter_paeth_decode` returns for that byte lane.
///
/// NOTE(review): the names suggest PNG Paeth unfiltering at 4 bytes per pixel, with
/// `a`/`b`/`c` being the left/above/upper-left neighbours — confirm against the png crate.
pub fn unfilter_4(previous: &[u8], current: &mut [u8]) {
    // `a_bpp`: the chunk written to `current` on the previous iteration (zeros before the first).
    let mut a_bpp = [0; 4];
    // `c_bpp`: the chunk read from `previous` on the previous iteration (zeros before the first).
    let mut c_bpp = [0; 4];

    for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
        // Compute all four lanes before storing, so the store is a single 4-byte write.
        let new_chunk = [
            chunk[0].wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
            chunk[1].wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
            chunk[2].wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
            chunk[3].wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
        ];
        // `chunks_exact_mut(4)` only yields 4-byte slices, so this conversion cannot fail.
        *TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
        // Carry this iteration's output and reference chunk into the next iteration.
        a_bpp = new_chunk;
        // `chunks_exact(4)` only yields 4-byte slices, so this conversion cannot fail either.
        c_bpp = b_bpp.try_into().unwrap();
    }
}

/// Selects one of `a`, `b`, or `c` using two signed 16-bit threshold comparisons.
///
/// With `thresh = 3*c - (a + b)`, `lo = min(a, b)` and `hi = max(a, b)`, the result is:
/// `hi` when `thresh <= lo`; otherwise `lo` when `hi <= thresh`; otherwise `c`.
///
/// NOTE(review): presumably a select-based formulation of the PNG Paeth predictor —
/// confirm against the png crate's implementation.
fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
    // Widened to i16 so the arithmetic cannot overflow: the value lies in [-510, 765].
    let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
    let lo = a.min(b);
    let hi = a.max(b);
    // First selection: `lo` if the threshold is at or above `hi`, else fall back to `c`.
    let t0 = if hi as i16 <= thresh { lo } else { c };
    // Second selection takes priority: `hi` if the threshold is at or below `lo`.
    let t1 = if thresh <= lo as i16 { hi } else { t0 };
    t1
}

The function is auto-vectorized but 60% slower after stable version 1.86.

After the LLVM 20 upgrade, the end of the loop appears to be doing unnecessary shuffling, unpacking, and packing.

Rust, 1.86 vs. nightly - https://rust.godbolt.org/z/dGqjzaaoK
LLVM IR diff, 1.86 vs. nightly - https://rust.godbolt.org/z/G57654Gs3
llc, the 1.86 IR still optimizes to the expected output - https://alive2.llvm.org/ce/z/2J4zb3

Assembly output
; 1.86                                          ; 1.87 onwards
.LBB0_2:                                        .LBB0_2:
        movdqa  xmm4, xmm1                              movdqa  xmm5, xmm8
        punpcklbw       xmm4, xmm0                      punpcklbw       xmm5, xmm0
        movd    xmm2, dword ptr [rdi + 4*rax]           pmullw  xmm5, xmm1
        movdqa  xmm5, xmm2                              movdqa  xmm10, xmm7
        movdqa  xmm6, xmm2                              punpcklbw       xmm10, xmm0
        pminub  xmm6, xmm3                              movd    xmm9, dword ptr [rdx + 4*rax]
        pmaxub  xmm2, xmm3                              movd    xmm6, dword ptr [rdi + 4*rax]
        movdqa  xmm7, xmm3                              movdqa  xmm11, xmm6
        punpcklbw       xmm7, xmm0                      punpcklbw       xmm11, xmm0
        movdqa  xmm3, xmm4                              paddw   xmm11, xmm10
        paddw   xmm3, xmm4                              psubw   xmm5, xmm11
        paddw   xmm3, xmm4                              movdqa  xmm10, xmm6
        movd    xmm4, dword ptr [rdx + 4*rax]           pminub  xmm10, xmm7
        punpcklbw       xmm5, xmm0                      pmaxub  xmm7, xmm6
        paddw   xmm5, xmm7                              movdqa  xmm11, xmm7
        psubw   xmm3, xmm5                              punpcklbw       xmm11, xmm0
        movdqa  xmm5, xmm2                              pcmpgtw xmm11, xmm5
        punpcklbw       xmm5, xmm0                      packsswb        xmm11, xmm11
        pcmpgtw xmm5, xmm3                              pand    xmm8, xmm11
        packsswb        xmm5, xmm5                      pandn   xmm11, xmm10
        pand    xmm1, xmm5                              por     xmm11, xmm8
        pandn   xmm5, xmm6                              punpcklbw       xmm10, xmm0
        por     xmm5, xmm1                              pcmpgtw xmm5, xmm10
        punpcklbw       xmm6, xmm0                      packsswb        xmm5, xmm5
        pcmpgtw xmm3, xmm6                              pand    xmm11, xmm5
        packsswb        xmm3, xmm3                      pandn   xmm5, xmm7
        pand    xmm5, xmm3                              por     xmm5, xmm11
        pandn   xmm3, xmm2                              paddb   xmm5, xmm9
        por     xmm3, xmm5                              movd    dword ptr [rdx + 4*rax], xmm5
        paddb   xmm3, xmm4                              lea     rcx, [rax + 1]
        movd    dword ptr [rdx + 4*rax], xmm3           pshufd  xmm7, xmm6, 80
        movd    xmm1, dword ptr [rdi + 4*rax]           pshufd  xmm8, xmm6, 0
        lea     rcx, [rax + 1]                          psrld   xmm8, 24
        mov     rax, rcx                                psrld   xmm7, 16
        cmp     rsi, rcx                                punpckldq       xmm7, xmm8
        jne     .LBB0_2                                 movdqa  xmm8, xmm6
                                                        psrld   xmm8, 8
                                                        punpckldq       xmm6, xmm8
                                                        pand    xmm6, xmm2
                                                        packuswb        xmm6, xmm6
                                                        packuswb        xmm6, xmm6
                                                        pand    xmm6, xmm3
                                                        pshuflw xmm7, xmm7, 132
                                                        pand    xmm7, xmm4
                                                        packuswb        xmm7, xmm7
                                                        movdqa  xmm8, xmm3
                                                        pandn   xmm8, xmm7
                                                        por     xmm8, xmm6
                                                        mov     rax, rcx
                                                        movdqa  xmm7, xmm5
                                                        cmp     rsi, rcx
                                                        jne     .LBB0_2
LLVM IR

1.86

define void @example::unfilter_4::h59b66a4b9d6665c5(ptr noalias nocapture noundef nonnull readonly align 1 %previous.0, i64 noundef %previous.1, ptr noalias nocapture noundef nonnull align 1 %current.0, i64 noundef %current.1) unnamed_addr personality ptr @rust_eh_personality {
_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit:
  %n.i.i.i.i79 = lshr i64 %current.1, 2
  %n.i.i3.i.i80 = lshr i64 %previous.1, 2
  %_0.sroa.0.0.sroa.speculated.i.i.i = tail call noundef i64 @llvm.umin.i64(i64 %n.i.i3.i.i80, i64 %n.i.i.i.i79)
  %_2.i84.not = icmp eq i64 %_0.sroa.0.0.sroa.speculated.i.i.i, 0
  br i1 %_2.i84.not, label %bb7, label %bb18

bb7:
  ret void

bb18:
  %iter.sroa.16.085 = phi i64 [ %2, %bb18 ], [ 0, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
  %0 = phi <4 x i8> [ %24, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
  %1 = phi <4 x i8> [ %19, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
  %2 = add nuw nsw i64 %iter.sroa.16.085, 1
  %start1.i.i = shl i64 %iter.sroa.16.085, 2
  %data.i.i = getelementptr inbounds i8, ptr %current.0, i64 %start1.i.i
  %data.i4.i = getelementptr inbounds i8, ptr %previous.0, i64 %start1.i.i
  %3 = zext <4 x i8> %0 to <4 x i16>
  %4 = zext <4 x i8> %1 to <4 x i16>
  %5 = load <4 x i8>, ptr %data.i.i, align 1
  %6 = load <4 x i8>, ptr %data.i4.i, align 1
  %7 = mul nuw nsw <4 x i16> %3, <i16 3, i16 3, i16 3, i16 3>
  %8 = zext <4 x i8> %6 to <4 x i16>
  %9 = add nuw nsw <4 x i16> %4, %8
  %10 = sub nsw <4 x i16> %7, %9
  %11 = tail call <4 x i8> @llvm.umin.v4i8(<4 x i8> %6, <4 x i8> %1)
  %12 = tail call <4 x i8> @llvm.umax.v4i8(<4 x i8> %6, <4 x i8> %1)
  %13 = zext <4 x i8> %12 to <4 x i16>
  %14 = icmp slt <4 x i16> %10, %13
  %15 = select <4 x i1> %14, <4 x i8> %0, <4 x i8> %11
  %16 = zext <4 x i8> %11 to <4 x i16>
  %17 = icmp sgt <4 x i16> %10, %16
  %18 = select <4 x i1> %17, <4 x i8> %15, <4 x i8> %12
  %19 = add <4 x i8> %18, %5
  store <4 x i8> %19, ptr %data.i.i, align 1
  %v.sroa.0.0.copyload = load i32, ptr %data.i4.i, align 1
  %.sroa.4.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 8
  %.sroa.531.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 16
  %.sroa.6.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 24
  %20 = insertelement <4 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
  %21 = insertelement <4 x i32> %20, i32 %.sroa.4.0.extract.shift, i64 1
  %22 = insertelement <4 x i32> %21, i32 %.sroa.531.0.extract.shift, i64 2
  %23 = insertelement <4 x i32> %22, i32 %.sroa.6.0.extract.shift, i64 3
  %24 = trunc <4 x i32> %23 to <4 x i8>
  %exitcond.not = icmp eq i64 %2, %_0.sroa.0.0.sroa.speculated.i.i.i
  br i1 %exitcond.not, label %bb7, label %bb18
}

declare noundef range(i32 0, 10) i32 @rust_eh_personality(i32 noundef, i32 noundef range(i32 1, 17), i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1

declare i64 @llvm.umin.i64(i64, i64) #2

declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) #2

declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) #2

nightly

define void @example::unfilter_4::he235d5dcdbd650d1(ptr noalias nocapture noundef nonnull readonly align 1 %previous.0, i64 noundef %previous.1, ptr noalias nocapture noundef nonnull align 1 %current.0, i64 noundef %current.1) unnamed_addr personality ptr @rust_eh_personality {
_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit:
  %n.i.i.i.i80 = lshr i64 %current.1, 2
  %n.i.i3.i.i81 = lshr i64 %previous.1, 2
  %_0.sroa.0.0.sroa.speculated.i.i.i = tail call noundef i64 @llvm.umin.i64(i64 %n.i.i3.i.i81, i64 %n.i.i.i.i80)
  %_2.i85.not = icmp eq i64 %_0.sroa.0.0.sroa.speculated.i.i.i, 0
  br i1 %_2.i85.not, label %bb7, label %bb18

bb7:
  ret void

bb18:
  %iter.sroa.16.086 = phi i64 [ %2, %bb18 ], [ 0, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
  %0 = phi <4 x i8> [ %26, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
  %1 = phi <4 x i8> [ %18, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
  %2 = add nuw nsw i64 %iter.sroa.16.086, 1
  %start1.i.i = shl i64 %iter.sroa.16.086, 2
  %data.i.i = getelementptr inbounds nuw i8, ptr %current.0, i64 %start1.i.i
  %data.i4.i = getelementptr inbounds nuw i8, ptr %previous.0, i64 %start1.i.i
  %3 = zext <4 x i8> %0 to <4 x i16>
  %4 = zext <4 x i8> %1 to <4 x i16>
  %v.sroa.0.0.copyload = load i32, ptr %data.i4.i, align 1
  %.sroa.4.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 8
  %5 = load <4 x i8>, ptr %data.i.i, align 1
  %.cast = bitcast i32 %v.sroa.0.0.copyload to <4 x i8>
  %6 = mul nuw nsw <4 x i16> %3, splat (i16 3)
  %7 = zext <4 x i8> %.cast to <4 x i16>
  %8 = add nuw nsw <4 x i16> %4, %7
  %9 = sub nsw <4 x i16> %6, %8
  %10 = tail call <4 x i8> @llvm.umin.v4i8(<4 x i8> %.cast, <4 x i8> %1)
  %11 = tail call <4 x i8> @llvm.umax.v4i8(<4 x i8> %.cast, <4 x i8> %1)
  %12 = zext <4 x i8> %11 to <4 x i16>
  %13 = icmp slt <4 x i16> %9, %12
  %14 = select <4 x i1> %13, <4 x i8> %0, <4 x i8> %10
  %15 = zext <4 x i8> %10 to <4 x i16>
  %16 = icmp sgt <4 x i16> %9, %15
  %17 = select <4 x i1> %16, <4 x i8> %14, <4 x i8> %11
  %18 = add <4 x i8> %17, %5
  store <4 x i8> %18, ptr %data.i.i, align 1
  %19 = insertelement <2 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
  %20 = shufflevector <2 x i32> %19, <2 x i32> poison, <2 x i32> zeroinitializer
  %21 = lshr <2 x i32> %20, <i32 16, i32 24>
  %22 = insertelement <4 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
  %23 = insertelement <4 x i32> %22, i32 %.sroa.4.0.extract.shift, i64 1
  %24 = shufflevector <2 x i32> %21, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
  %25 = shufflevector <4 x i32> %23, <4 x i32> %24, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %26 = trunc <4 x i32> %25 to <4 x i8>
  %exitcond.not = icmp eq i64 %2, %_0.sroa.0.0.sroa.speculated.i.i.i
  br i1 %exitcond.not, label %bb7, label %bb18
}

declare noundef range(i32 0, 10) i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1

declare i64 @llvm.umin.i64(i64, i64) #2

declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) #2

declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) #2

Version it worked on

It most recently worked on: Rust 1.86

rustc 1.86.0 (05f9846f8 2025-03-31)
binary: rustc
commit-hash: 05f9846f893b09a1be1fc8560e33fc3c815cfecb
commit-date: 2025-03-31
host: x86_64-unknown-linux-gnu
release: 1.86.0
LLVM version: 19.1.7

Version with regression

rustc 1.87.0 (17067e9ac 2025-05-09)
binary: rustc
commit-hash: 17067e9ac6d7ecb70e50f92c1944e545188d2359
commit-date: 2025-05-09
host: x86_64-unknown-linux-gnu
release: 1.87.0
LLVM version: 20.1.1

Other context

I think this is part of the regression related to the png crate issue for reduced performance. image-rs/image-png#598
#142519 was filed for that issue but I think it's been reduced beyond this specific filter vectorization regression.

@rustbot modify labels: +regression-from-stable-to-stable -regression-untriaged

Metadata

Metadata

Assignees

No one assigned

    Labels

    A-LLVM — Area: Code generation parts specific to LLVM. Both correctness bugs and optimization-related issues.
    C-bug — Category: This is a bug.
    I-prioritize — Issue: Indicates that prioritization has been requested for this issue.
    needs-triage — This issue may need triage. Remove it if it has been sufficiently triaged.
    regression-from-stable-to-stable — Performance or correctness regression from one stable version to another.

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions