Skip to content

Commit 48d8a30

Browse files
committed
[x64] Add AVX512 optimizations for PERMUTE_V128
Uses the single-instruction AVX512 `vperm*` instructions to accelerate the `INT8_TYPE` and `INT16_TYPE` permutation opcodes. The `INT8_TYPE` path is accelerated using the `AVX512VBMI` subset of AVX512, which has been available since Ice Lake (Intel) and Zen 4 (AMD).
1 parent 1329c38 commit 48d8a30

File tree

1 file changed

+83
-2
lines changed

1 file changed

+83
-2
lines changed

src/xenia/cpu/backend/x64/x64_seq_vector.cc

Lines changed: 83 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -594,7 +594,7 @@ struct VECTOR_ADD
594594

595595
e.vpsrad(e.xmm2, e.xmm1, 31);
596596
e.vpxord(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSignMaskI32));
597-
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
597+
e.vpblendmd(dest | saturate, e.xmm1, e.xmm2);
598598
return;
599599
}
600600

@@ -1776,7 +1776,23 @@ struct PERMUTE_V128
17761776
} else {
17771777
e.vxorps(e.xmm0, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
17781778
}
1779+
1780+
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512VBMI)) {
1781+
Xmm table_lo = e.xmm1;
1782+
if (i.src2.is_constant) {
1783+
e.LoadConstantXmm(table_lo, i.src2.constant());
1784+
} else {
1785+
table_lo = i.src2;
1786+
}
1787+
Opmask zeroes = e.k1;
1788+
// _mm_cmple_epu8_mask
1789+
e.vpcmpub(zeroes, e.xmm0, e.GetXmmConstPtr(XMMPermuteControl15), 2);
1790+
e.vpermb(i.dest.reg() | zeroes | e.T_z, e.xmm0, table_lo);
1791+
return;
1792+
}
1793+
17791794
e.vpand(e.xmm0, e.GetXmmConstPtr(XMMPermuteByteMask));
1795+
17801796
if (i.src2.is_constant) {
17811797
e.LoadConstantXmm(i.dest, i.src2.constant());
17821798
e.vpshufb(i.dest, i.dest, e.xmm0);
@@ -1792,13 +1808,47 @@ struct PERMUTE_V128
17921808
// General permute.
17931809
// Control mask needs to be shuffled.
17941810
// TODO(benvanik): do constants here instead of in generated code.
1811+
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW |
1812+
kX64EmitAVX512VBMI)) {
1813+
Xmm table_idx = e.xmm0;
1814+
if (i.src1.is_constant) {
1815+
e.LoadConstantXmm(table_idx, i.src1.constant());
1816+
e.vxorps(table_idx, table_idx, e.GetXmmConstPtr(XMMSwapWordMask));
1817+
} else {
1818+
e.vxorps(table_idx, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
1819+
}
1820+
1821+
Xmm table_lo = e.xmm1;
1822+
if (i.src2.value->IsConstantZero()) {
1823+
e.vpxor(table_lo, table_lo);
1824+
} else if (i.src2.is_constant) {
1825+
e.LoadConstantXmm(table_lo, i.src2.constant());
1826+
} else {
1827+
table_lo = i.src2;
1828+
}
1829+
1830+
Xmm table_hi = e.xmm2;
1831+
if (i.src3.value->IsConstantZero()) {
1832+
e.vpxor(table_hi, table_hi);
1833+
} else if (i.src3.is_constant) {
1834+
e.LoadConstantXmm(table_hi, i.src3.constant());
1835+
} else {
1836+
table_hi = i.src3;
1837+
}
1838+
1839+
e.vpermi2b(table_idx, table_lo, table_hi);
1840+
e.vmovdqu8(i.dest, table_idx);
1841+
return;
1842+
}
1843+
17951844
if (i.src1.is_constant) {
17961845
e.LoadConstantXmm(e.xmm2, i.src1.constant());
17971846
e.vxorps(e.xmm2, e.xmm2, e.GetXmmConstPtr(XMMSwapWordMask));
17981847
} else {
17991848
e.vxorps(e.xmm2, i.src1, e.GetXmmConstPtr(XMMSwapWordMask));
18001849
}
18011850
e.vpand(e.xmm2, e.GetXmmConstPtr(XMMPermuteByteMask));
1851+
18021852
Xmm src2_shuf = e.xmm0;
18031853
if (i.src2.value->IsConstantZero()) {
18041854
e.vpxor(src2_shuf, src2_shuf);
@@ -1825,8 +1875,39 @@ struct PERMUTE_V128
18251875

18261876
static void EmitByInt16(X64Emitter& e, const EmitArgType& i) {
18271877
// src1 is an array of indices corresponding to positions within src2 and
1828-
// src3.
1878+
// src3
1879+
if (e.IsFeatureEnabled(kX64EmitAVX512Ortho | kX64EmitAVX512BW)) {
1880+
e.LoadConstantXmm(e.xmm1, vec128s(0x1));
1881+
1882+
Xmm table_idx = e.xmm0;
1883+
if (i.src1.is_constant) {
1884+
e.LoadConstantXmm(table_idx, i.src1.constant());
1885+
e.vpxord(table_idx, table_idx, e.xmm1);
1886+
} else {
1887+
e.vpxord(table_idx, i.src1, e.xmm1);
1888+
}
1889+
1890+
Xmm table_lo = e.xmm1;
1891+
if (i.src2.is_constant) {
1892+
e.LoadConstantXmm(table_lo, i.src2.constant());
1893+
} else {
1894+
table_lo = i.src2;
1895+
}
1896+
1897+
Xmm table_hi = e.xmm2;
1898+
if (i.src3.is_constant) {
1899+
e.LoadConstantXmm(table_hi, i.src3.constant());
1900+
} else {
1901+
table_hi = i.src3;
1902+
}
1903+
1904+
e.vpermi2w(table_idx, table_lo, table_hi);
1905+
e.vmovdqu8(i.dest, table_idx);
1906+
return;
1907+
}
1908+
18291909
assert_true(i.src1.is_constant);
1910+
18301911
vec128_t perm = (i.src1.constant() & vec128s(0xF)) ^ vec128s(0x1);
18311912
vec128_t perm_ctrl = vec128b(0);
18321913
for (int i = 0; i < 8; i++) {

0 commit comments

Comments
 (0)