-
Notifications
You must be signed in to change notification settings - Fork 13.7k
Description
Code
I tried this code:
/// Updates `current` in place, four bytes at a time, by adding to each byte
/// the value returned by `filter_paeth_decode` for its three neighbours.
///
/// * `previous` - read-only row supplying the `b` (same position) and `c`
///   (preceding 4-byte chunk) inputs.
/// * `current` - row that is rewritten in place; its already-rewritten
///   preceding 4-byte chunk supplies the `a` input.
///
/// Only complete 4-byte chunks present in *both* slices are processed: the
/// `zip` stops at the shorter of the two chunk iterators, and any trailing
/// bytes that do not fill a whole chunk are left untouched.
pub fn unfilter_4(previous: &[u8], current: &mut [u8]) {
// `a_bpp`: the chunk of `current` written by the previous iteration.
// `c_bpp`: the chunk of `previous` seen by the previous iteration.
// Both start as zeros, so the first chunk sees all-zero `a` and `c` inputs.
let mut a_bpp = [0; 4];
let mut c_bpp = [0; 4];
for (chunk, b_bpp) in current.chunks_exact_mut(4).zip(previous.chunks_exact(4)) {
// Each of the four byte lanes is computed independently; the addition
// wraps modulo 256 instead of panicking on overflow.
let new_chunk = [
chunk[0].wrapping_add(filter_paeth_decode(a_bpp[0], b_bpp[0], c_bpp[0])),
chunk[1].wrapping_add(filter_paeth_decode(a_bpp[1], b_bpp[1], c_bpp[1])),
chunk[2].wrapping_add(filter_paeth_decode(a_bpp[2], b_bpp[2], c_bpp[2])),
chunk[3].wrapping_add(filter_paeth_decode(a_bpp[3], b_bpp[3], c_bpp[3])),
];
// `chunks_exact_mut(4)` only yields slices of length 4, so the conversion
// to `&mut [u8; 4]` cannot fail; all four bytes are stored in one assignment.
*TryInto::<&mut [u8; 4]>::try_into(chunk).unwrap() = new_chunk;
// Carry this iteration's output and its `previous` chunk into the next one.
a_bpp = new_chunk;
// Likewise infallible: `chunks_exact(4)` only yields slices of length 4.
c_bpp = b_bpp.try_into().unwrap();
}
}
/// Selects one of `a`, `b` or `c` as the predictor byte.
///
/// With `thresh = 3*c - (a + b)`, `lo = min(a, b)` and `hi = max(a, b)`,
/// the result is:
/// * `hi` when `thresh <= lo`,
/// * otherwise `lo` when `hi <= thresh`,
/// * otherwise `c`.
///
/// NOTE(review): judging by the name this is presumably a reformulation of
/// the PNG Paeth predictor; equivalence is not shown here — confirm against
/// the PNG specification.
fn filter_paeth_decode(a: u8, b: u8, c: u8) -> u8 {
// Computed in i16: with u8 inputs the value lies in -510..=765, so it
// cannot overflow.
let thresh = i16::from(c) * 3 - (i16::from(a) + i16::from(b));
let lo = a.min(b);
let hi = a.max(b);
// First selection: `lo` if `hi <= thresh`, else `c`.
let t0 = if hi as i16 <= thresh { lo } else { c };
// Second selection takes precedence: `hi` if `thresh <= lo`, else `t0`.
let t1 = if thresh <= lo as i16 { hi } else { t0 };
t1
}
The function is still auto-vectorized, but it runs about 60% slower on versions after stable 1.86 (i.e. from 1.87 onwards).
After the LLVM 20 upgrade, the end of the loop appears to be doing unnecessary shuffling, unpacking, and packing.
Rust, 1.86 vs. nightly - https://rust.godbolt.org/z/dGqjzaaoK
LLVM IR diff, 1.86 vs. nightly - https://rust.godbolt.org/z/G57654Gs3
llc, the 1.86 IR still optimizes to the expected output - https://alive2.llvm.org/ce/z/2J4zb3
Assembly output
; 1.86 ; 1.87 onwards
.LBB0_2: .LBB0_2:
movdqa xmm4, xmm1 movdqa xmm5, xmm8
punpcklbw xmm4, xmm0 punpcklbw xmm5, xmm0
movd xmm2, dword ptr [rdi + 4*rax] pmullw xmm5, xmm1
movdqa xmm5, xmm2 movdqa xmm10, xmm7
movdqa xmm6, xmm2 punpcklbw xmm10, xmm0
pminub xmm6, xmm3 movd xmm9, dword ptr [rdx + 4*rax]
pmaxub xmm2, xmm3 movd xmm6, dword ptr [rdi + 4*rax]
movdqa xmm7, xmm3 movdqa xmm11, xmm6
punpcklbw xmm7, xmm0 punpcklbw xmm11, xmm0
movdqa xmm3, xmm4 paddw xmm11, xmm10
paddw xmm3, xmm4 psubw xmm5, xmm11
paddw xmm3, xmm4 movdqa xmm10, xmm6
movd xmm4, dword ptr [rdx + 4*rax] pminub xmm10, xmm7
punpcklbw xmm5, xmm0 pmaxub xmm7, xmm6
paddw xmm5, xmm7 movdqa xmm11, xmm7
psubw xmm3, xmm5 punpcklbw xmm11, xmm0
movdqa xmm5, xmm2 pcmpgtw xmm11, xmm5
punpcklbw xmm5, xmm0 packsswb xmm11, xmm11
pcmpgtw xmm5, xmm3 pand xmm8, xmm11
packsswb xmm5, xmm5 pandn xmm11, xmm10
pand xmm1, xmm5 por xmm11, xmm8
pandn xmm5, xmm6 punpcklbw xmm10, xmm0
por xmm5, xmm1 pcmpgtw xmm5, xmm10
punpcklbw xmm6, xmm0 packsswb xmm5, xmm5
pcmpgtw xmm3, xmm6 pand xmm11, xmm5
packsswb xmm3, xmm3 pandn xmm5, xmm7
pand xmm5, xmm3 por xmm5, xmm11
pandn xmm3, xmm2 paddb xmm5, xmm9
por xmm3, xmm5 movd dword ptr [rdx + 4*rax], xmm5
paddb xmm3, xmm4 lea rcx, [rax + 1]
movd dword ptr [rdx + 4*rax], xmm3 pshufd xmm7, xmm6, 80
movd xmm1, dword ptr [rdi + 4*rax] pshufd xmm8, xmm6, 0
lea rcx, [rax + 1] psrld xmm8, 24
mov rax, rcx psrld xmm7, 16
cmp rsi, rcx punpckldq xmm7, xmm8
jne .LBB0_2 movdqa xmm8, xmm6
psrld xmm8, 8
punpckldq xmm6, xmm8
pand xmm6, xmm2
packuswb xmm6, xmm6
packuswb xmm6, xmm6
pand xmm6, xmm3
pshuflw xmm7, xmm7, 132
pand xmm7, xmm4
packuswb xmm7, xmm7
movdqa xmm8, xmm3
pandn xmm8, xmm7
por xmm8, xmm6
mov rax, rcx
movdqa xmm7, xmm5
cmp rsi, rcx
jne .LBB0_2
LLVM IR
1.86
define void @example::unfilter_4::h59b66a4b9d6665c5(ptr noalias nocapture noundef nonnull readonly align 1 %previous.0, i64 noundef %previous.1, ptr noalias nocapture noundef nonnull align 1 %current.0, i64 noundef %current.1) unnamed_addr personality ptr @rust_eh_personality {
_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit:
%n.i.i.i.i79 = lshr i64 %current.1, 2
%n.i.i3.i.i80 = lshr i64 %previous.1, 2
%_0.sroa.0.0.sroa.speculated.i.i.i = tail call noundef i64 @llvm.umin.i64(i64 %n.i.i3.i.i80, i64 %n.i.i.i.i79)
%_2.i84.not = icmp eq i64 %_0.sroa.0.0.sroa.speculated.i.i.i, 0
br i1 %_2.i84.not, label %bb7, label %bb18
bb7:
ret void
bb18:
%iter.sroa.16.085 = phi i64 [ %2, %bb18 ], [ 0, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
%0 = phi <4 x i8> [ %24, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
%1 = phi <4 x i8> [ %19, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h555de13c53d19f33E.exit ]
%2 = add nuw nsw i64 %iter.sroa.16.085, 1
%start1.i.i = shl i64 %iter.sroa.16.085, 2
%data.i.i = getelementptr inbounds i8, ptr %current.0, i64 %start1.i.i
%data.i4.i = getelementptr inbounds i8, ptr %previous.0, i64 %start1.i.i
%3 = zext <4 x i8> %0 to <4 x i16>
%4 = zext <4 x i8> %1 to <4 x i16>
%5 = load <4 x i8>, ptr %data.i.i, align 1
%6 = load <4 x i8>, ptr %data.i4.i, align 1
%7 = mul nuw nsw <4 x i16> %3, <i16 3, i16 3, i16 3, i16 3>
%8 = zext <4 x i8> %6 to <4 x i16>
%9 = add nuw nsw <4 x i16> %4, %8
%10 = sub nsw <4 x i16> %7, %9
%11 = tail call <4 x i8> @llvm.umin.v4i8(<4 x i8> %6, <4 x i8> %1)
%12 = tail call <4 x i8> @llvm.umax.v4i8(<4 x i8> %6, <4 x i8> %1)
%13 = zext <4 x i8> %12 to <4 x i16>
%14 = icmp slt <4 x i16> %10, %13
%15 = select <4 x i1> %14, <4 x i8> %0, <4 x i8> %11
%16 = zext <4 x i8> %11 to <4 x i16>
%17 = icmp sgt <4 x i16> %10, %16
%18 = select <4 x i1> %17, <4 x i8> %15, <4 x i8> %12
%19 = add <4 x i8> %18, %5
store <4 x i8> %19, ptr %data.i.i, align 1
%v.sroa.0.0.copyload = load i32, ptr %data.i4.i, align 1
%.sroa.4.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 8
%.sroa.531.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 16
%.sroa.6.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 24
%20 = insertelement <4 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
%21 = insertelement <4 x i32> %20, i32 %.sroa.4.0.extract.shift, i64 1
%22 = insertelement <4 x i32> %21, i32 %.sroa.531.0.extract.shift, i64 2
%23 = insertelement <4 x i32> %22, i32 %.sroa.6.0.extract.shift, i64 3
%24 = trunc <4 x i32> %23 to <4 x i8>
%exitcond.not = icmp eq i64 %2, %_0.sroa.0.0.sroa.speculated.i.i.i
br i1 %exitcond.not, label %bb7, label %bb18
}
declare noundef range(i32 0, 10) i32 @rust_eh_personality(i32 noundef, i32 noundef range(i32 1, 17), i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1
declare i64 @llvm.umin.i64(i64, i64) #2
declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) #2
declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) #2
nightly
define void @example::unfilter_4::he235d5dcdbd650d1(ptr noalias nocapture noundef nonnull readonly align 1 %previous.0, i64 noundef %previous.1, ptr noalias nocapture noundef nonnull align 1 %current.0, i64 noundef %current.1) unnamed_addr personality ptr @rust_eh_personality {
_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit:
%n.i.i.i.i80 = lshr i64 %current.1, 2
%n.i.i3.i.i81 = lshr i64 %previous.1, 2
%_0.sroa.0.0.sroa.speculated.i.i.i = tail call noundef i64 @llvm.umin.i64(i64 %n.i.i3.i.i81, i64 %n.i.i.i.i80)
%_2.i85.not = icmp eq i64 %_0.sroa.0.0.sroa.speculated.i.i.i, 0
br i1 %_2.i85.not, label %bb7, label %bb18
bb7:
ret void
bb18:
%iter.sroa.16.086 = phi i64 [ %2, %bb18 ], [ 0, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
%0 = phi <4 x i8> [ %26, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
%1 = phi <4 x i8> [ %18, %bb18 ], [ zeroinitializer, %_ZN4core4iter6traits8iterator8Iterator3zip17h05011f840b2372e0E.exit ]
%2 = add nuw nsw i64 %iter.sroa.16.086, 1
%start1.i.i = shl i64 %iter.sroa.16.086, 2
%data.i.i = getelementptr inbounds nuw i8, ptr %current.0, i64 %start1.i.i
%data.i4.i = getelementptr inbounds nuw i8, ptr %previous.0, i64 %start1.i.i
%3 = zext <4 x i8> %0 to <4 x i16>
%4 = zext <4 x i8> %1 to <4 x i16>
%v.sroa.0.0.copyload = load i32, ptr %data.i4.i, align 1
%.sroa.4.0.extract.shift = lshr i32 %v.sroa.0.0.copyload, 8
%5 = load <4 x i8>, ptr %data.i.i, align 1
%.cast = bitcast i32 %v.sroa.0.0.copyload to <4 x i8>
%6 = mul nuw nsw <4 x i16> %3, splat (i16 3)
%7 = zext <4 x i8> %.cast to <4 x i16>
%8 = add nuw nsw <4 x i16> %4, %7
%9 = sub nsw <4 x i16> %6, %8
%10 = tail call <4 x i8> @llvm.umin.v4i8(<4 x i8> %.cast, <4 x i8> %1)
%11 = tail call <4 x i8> @llvm.umax.v4i8(<4 x i8> %.cast, <4 x i8> %1)
%12 = zext <4 x i8> %11 to <4 x i16>
%13 = icmp slt <4 x i16> %9, %12
%14 = select <4 x i1> %13, <4 x i8> %0, <4 x i8> %10
%15 = zext <4 x i8> %10 to <4 x i16>
%16 = icmp sgt <4 x i16> %9, %15
%17 = select <4 x i1> %16, <4 x i8> %14, <4 x i8> %11
%18 = add <4 x i8> %17, %5
store <4 x i8> %18, ptr %data.i.i, align 1
%19 = insertelement <2 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
%20 = shufflevector <2 x i32> %19, <2 x i32> poison, <2 x i32> zeroinitializer
%21 = lshr <2 x i32> %20, <i32 16, i32 24>
%22 = insertelement <4 x i32> poison, i32 %v.sroa.0.0.copyload, i64 0
%23 = insertelement <4 x i32> %22, i32 %.sroa.4.0.extract.shift, i64 1
%24 = shufflevector <2 x i32> %21, <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
%25 = shufflevector <4 x i32> %23, <4 x i32> %24, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
%26 = trunc <4 x i32> %25 to <4 x i8>
%exitcond.not = icmp eq i64 %2, %_0.sroa.0.0.sroa.speculated.i.i.i
br i1 %exitcond.not, label %bb7, label %bb18
}
declare noundef range(i32 0, 10) i32 @rust_eh_personality(i32 noundef, i32 noundef, i64 noundef, ptr noundef, ptr noundef) unnamed_addr #1
declare i64 @llvm.umin.i64(i64, i64) #2
declare <4 x i8> @llvm.umin.v4i8(<4 x i8>, <4 x i8>) #2
declare <4 x i8> @llvm.umax.v4i8(<4 x i8>, <4 x i8>) #2
Version it worked on
It most recently worked on: Rust 1.86
rustc 1.86.0 (05f9846f8 2025-03-31)
binary: rustc
commit-hash: 05f9846f893b09a1be1fc8560e33fc3c815cfecb
commit-date: 2025-03-31
host: x86_64-unknown-linux-gnu
release: 1.86.0
LLVM version: 19.1.7
Version with regression
rustc 1.87.0 (17067e9ac 2025-05-09)
binary: rustc
commit-hash: 17067e9ac6d7ecb70e50f92c1944e545188d2359
commit-date: 2025-05-09
host: x86_64-unknown-linux-gnu
release: 1.87.0
LLVM version: 20.1.1
Other context
I think this is part of the regression related to the png crate issue for reduced performance. image-rs/image-png#598
#142519 was filed for that issue but I think it's been reduced beyond this specific filter vectorization regression.
@rustbot modify labels: +regression-from-stable-to-stable -regression-untriaged