@@ -594,7 +594,7 @@ struct VECTOR_ADD
594594
595595 e.vpsrad (e.xmm2 , e.xmm1 , 31 );
596596 e.vpxord (e.xmm2 , e.xmm2 , e.GetXmmConstPtr (XMMSignMaskI32));
597- e.vpblendmd (dest | saturate, e.xmm1 , e.xmm2 );
597+ e.vpblendmd (dest | saturate, e.xmm1 , e.xmm2 );
598598 return ;
599599 }
600600
@@ -1776,7 +1776,23 @@ struct PERMUTE_V128
17761776 } else {
17771777 e.vxorps (e.xmm0 , i.src1 , e.GetXmmConstPtr (XMMSwapWordMask));
17781778 }
1779+
1780+ if (e.IsFeatureEnabled (kX64EmitAVX512Ortho | kX64EmitAVX512VBMI )) {
1781+ Xmm table_lo = e.xmm1 ;
1782+ if (i.src2 .is_constant ) {
1783+ e.LoadConstantXmm (table_lo, i.src2 .constant ());
1784+ } else {
1785+ table_lo = i.src2 ;
1786+ }
1787+ Opmask zeroes = e.k1 ;
1788+ // _mm_cmple_epu8_mask
1789+ e.vpcmpub (zeroes, e.xmm0 , e.GetXmmConstPtr (XMMPermuteControl15), 2 );
1790+ e.vpermb (i.dest .reg () | zeroes | e.T_z , e.xmm0 , table_lo);
1791+ return ;
1792+ }
1793+
17791794 e.vpand (e.xmm0 , e.GetXmmConstPtr (XMMPermuteByteMask));
1795+
17801796 if (i.src2 .is_constant ) {
17811797 e.LoadConstantXmm (i.dest , i.src2 .constant ());
17821798 e.vpshufb (i.dest , i.dest , e.xmm0 );
@@ -1792,13 +1808,47 @@ struct PERMUTE_V128
17921808 // General permute.
17931809 // Control mask needs to be shuffled.
17941810 // TODO(benvanik): do constants here instead of in generated code.
1811+ if (e.IsFeatureEnabled (kX64EmitAVX512Ortho | kX64EmitAVX512BW |
1812+ kX64EmitAVX512VBMI )) {
1813+ Xmm table_idx = e.xmm0 ;
1814+ if (i.src1 .is_constant ) {
1815+ e.LoadConstantXmm (table_idx, i.src1 .constant ());
1816+ e.vxorps (table_idx, table_idx, e.GetXmmConstPtr (XMMSwapWordMask));
1817+ } else {
1818+ e.vxorps (table_idx, i.src1 , e.GetXmmConstPtr (XMMSwapWordMask));
1819+ }
1820+
1821+ Xmm table_lo = e.xmm1 ;
1822+ if (i.src2 .value ->IsConstantZero ()) {
1823+ e.vpxor (table_lo, table_lo);
1824+ } else if (i.src2 .is_constant ) {
1825+ e.LoadConstantXmm (table_lo, i.src2 .constant ());
1826+ } else {
1827+ table_lo = i.src2 ;
1828+ }
1829+
1830+ Xmm table_hi = e.xmm2 ;
1831+ if (i.src3 .value ->IsConstantZero ()) {
1832+ e.vpxor (table_hi, table_hi);
1833+ } else if (i.src3 .is_constant ) {
1834+ e.LoadConstantXmm (table_hi, i.src3 .constant ());
1835+ } else {
1836+ table_hi = i.src3 ;
1837+ }
1838+
1839+ e.vpermi2b (table_idx, table_lo, table_hi);
1840+ e.vmovdqu8 (i.dest , table_idx);
1841+ return ;
1842+ }
1843+
17951844 if (i.src1 .is_constant ) {
17961845 e.LoadConstantXmm (e.xmm2 , i.src1 .constant ());
17971846 e.vxorps (e.xmm2 , e.xmm2 , e.GetXmmConstPtr (XMMSwapWordMask));
17981847 } else {
17991848 e.vxorps (e.xmm2 , i.src1 , e.GetXmmConstPtr (XMMSwapWordMask));
18001849 }
18011850 e.vpand (e.xmm2 , e.GetXmmConstPtr (XMMPermuteByteMask));
1851+
18021852 Xmm src2_shuf = e.xmm0 ;
18031853 if (i.src2 .value ->IsConstantZero ()) {
18041854 e.vpxor (src2_shuf, src2_shuf);
@@ -1825,8 +1875,39 @@ struct PERMUTE_V128
18251875
18261876 static void EmitByInt16 (X64Emitter& e, const EmitArgType& i) {
18271877 // src1 is an array of indices corresponding to positions within src2 and
1828- // src3.
1878+ // src3
1879+ if (e.IsFeatureEnabled (kX64EmitAVX512Ortho | kX64EmitAVX512BW )) {
1880+ e.LoadConstantXmm (e.xmm1 , vec128s (0x1 ));
1881+
1882+ Xmm table_idx = e.xmm0 ;
1883+ if (i.src1 .is_constant ) {
1884+ e.LoadConstantXmm (table_idx, i.src1 .constant ());
1885+ e.vpxord (table_idx, table_idx, e.xmm1 );
1886+ } else {
1887+ e.vpxord (table_idx, i.src1 , e.xmm1 );
1888+ }
1889+
1890+ Xmm table_lo = e.xmm1 ;
1891+ if (i.src2 .is_constant ) {
1892+ e.LoadConstantXmm (table_lo, i.src2 .constant ());
1893+ } else {
1894+ table_lo = i.src2 ;
1895+ }
1896+
1897+ Xmm table_hi = e.xmm2 ;
1898+ if (i.src3 .is_constant ) {
1899+ e.LoadConstantXmm (table_hi, i.src3 .constant ());
1900+ } else {
1901+ table_hi = i.src3 ;
1902+ }
1903+
1904+ e.vpermi2w (table_idx, table_lo, table_hi);
1905+ e.vmovdqu8 (i.dest , table_idx);
1906+ return ;
1907+ }
1908+
18291909 assert_true (i.src1 .is_constant );
1910+
18301911 vec128_t perm = (i.src1 .constant () & vec128s (0xF )) ^ vec128s (0x1 );
18311912 vec128_t perm_ctrl = vec128b (0 );
18321913 for (int i = 0 ; i < 8 ; i++) {
0 commit comments