@@ -1517,3 +1517,214 @@ uint64x2_t test_splatq_laneq_u64(uint64x2_t v) {
   // LLVM: [[RES:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> [[TMP1]], <2 x i32> zeroinitializer
   // LLVM: ret <2 x i64> [[RES]]
 }
+
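+// vpadal/vpadalq perform a pairwise add-accumulate long: adjacent pairs of
+// elements in b are widened and summed, then accumulated into a. The checks
+// below reflect the CIR lowering, which models this as an
+// aarch64.neon.{s,u}addlp intrinsic on b followed by an ordinary vector add.
+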
+int16x4_t test_vpadal_s8(int16x4_t a, int8x8_t b) {
+  return vpadal_s8(a, b);
+
+  // CIR-LABEL: vpadal_s8
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s8i x 8>) -> !cir.vector<!s16i x 4>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!s16i x 4>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s16i x 4>
+
+  // LLVM: {{.*}}test_vpadal_s8(<4 x i16>{{.*}}[[a:%.*]], <8 x i8>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[a]] to <8 x i8>
+  // LLVM: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> [[b]])
+  // LLVM: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], [[a]]
+  // LLVM: ret <4 x i16> [[TMP1]]
+}
+
+int32x2_t test_vpadal_s16(int32x2_t a, int16x4_t b) {
+  return vpadal_s16(a, b);
+
+  // CIR-LABEL: vpadal_s16
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s16i x 4>) -> !cir.vector<!s32i x 2>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!s32i x 2>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s32i x 2>
+
+  // LLVM: {{.*}}test_vpadal_s16(<2 x i32>{{.*}}[[a:%.*]], <4 x i16>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[a]] to <8 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x i16> [[b]] to <8 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <2 x i32> [[TMP2]]
+}
+
+int64x1_t test_vpadal_s32(int64x1_t a, int32x2_t b) {
+  return vpadal_s32(a, b);
+
+  // CIR-LABEL: vpadal_s32
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s32i x 2>) -> !cir.vector<!s64i x 1>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!s64i x 1>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s64i x 1>
+
+  // LLVM: {{.*}}test_vpadal_s32(<1 x i64>{{.*}}[[a:%.*]], <2 x i32>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <1 x i64> [[a]] to <8 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x i32> [[b]] to <8 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <1 x i64> [[TMP2]]
+}
+
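+// The unsigned variants are identical in shape but lower to
+// aarch64.neon.uaddlp instead of saddlp.
+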
+uint16x4_t test_vpadal_u8(uint16x4_t a, uint8x8_t b) {
+  return vpadal_u8(a, b);
+
+  // CIR-LABEL: vpadal_u8
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u8i x 8>) -> !cir.vector<!u16i x 4>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!u16i x 4>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u16i x 4>
+
+  // LLVM: {{.*}}test_vpadal_u8(<4 x i16>{{.*}}[[a:%.*]], <8 x i8>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x i16> [[a]] to <8 x i8>
+  // LLVM: [[VPADAL_I:%.*]] = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> [[b]])
+  // LLVM: [[TMP1:%.*]] = add <4 x i16> [[VPADAL_I]], [[a]]
+  // LLVM: ret <4 x i16> [[TMP1]]
+}
+
+uint32x2_t test_vpadal_u16(uint32x2_t a, uint16x4_t b) {
+  return vpadal_u16(a, b);
+
+  // CIR-LABEL: vpadal_u16
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u16i x 4>) -> !cir.vector<!u32i x 2>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!u32i x 2>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u32i x 2>
+
+  // LLVM: {{.*}}test_vpadal_u16(<2 x i32>{{.*}}[[a:%.*]], <4 x i16>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x i32> [[a]] to <8 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x i16> [[b]] to <8 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <2 x i32> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <2 x i32> [[TMP2]]
+}
+
+uint64x1_t test_vpadal_u32(uint64x1_t a, uint32x2_t b) {
+  return vpadal_u32(a, b);
+
+  // CIR-LABEL: vpadal_u32
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u32i x 2>) -> !cir.vector<!u64i x 1>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 8>), !cir.vector<!u64i x 1>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u64i x 1>
+
+  // LLVM: {{.*}}test_vpadal_u32(<1 x i64>{{.*}}[[a:%.*]], <2 x i32>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <1 x i64> [[a]] to <8 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <2 x i32> [[b]] to <8 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <1 x i64> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <1 x i64> [[TMP2]]
+}
+
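+// The q-register (128-bit) variants follow the same pattern on 16-byte
+// vectors.
+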
+int16x8_t test_vpadalq_s8(int16x8_t a, int8x16_t b) {
+  return vpadalq_s8(a, b);
+
+  // CIR-LABEL: vpadalq_s8
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s8i x 16>) -> !cir.vector<!s16i x 8>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!s16i x 8>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s16i x 8>
+
+  // LLVM: {{.*}}test_vpadalq_s8(<8 x i16>{{.*}}[[a:%.*]], <16 x i8>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[a]] to <16 x i8>
+  // LLVM: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> [[b]])
+  // LLVM: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], [[a]]
+  // LLVM: ret <8 x i16> [[TMP1]]
+}
+
+int32x4_t test_vpadalq_s16(int32x4_t a, int16x8_t b) {
+  return vpadalq_s16(a, b);
+
+  // CIR-LABEL: vpadalq_s16
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s16i x 8>) -> !cir.vector<!s32i x 4>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!s32i x 4>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s32i x 4>
+
+  // LLVM: {{.*}}test_vpadalq_s16(<4 x i32>{{.*}}[[a:%.*]], <8 x i16>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[a]] to <16 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <8 x i16> [[b]] to <16 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <4 x i32> [[TMP2]]
+}
+
+int64x2_t test_vpadalq_s32(int64x2_t a, int32x4_t b) {
+  return vpadalq_s32(a, b);
+
+  // CIR-LABEL: vpadalq_s32
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.saddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!s32i x 4>) -> !cir.vector<!s64i x 2>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!s64i x 2>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!s64i x 2>
+
+  // LLVM: {{.*}}test_vpadalq_s32(<2 x i64>{{.*}}[[a:%.*]], <4 x i32>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x i64> [[a]] to <16 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x i32> [[b]] to <16 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <2 x i64> [[TMP2]]
+}
+
+uint16x8_t test_vpadalq_u8(uint16x8_t a, uint8x16_t b) {
+  return vpadalq_u8(a, b);
+
+  // CIR-LABEL: vpadalq_u8
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u8i x 16>) -> !cir.vector<!u16i x 8>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!u16i x 8>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u16i x 8>
+
+  // LLVM: {{.*}}test_vpadalq_u8(<8 x i16>{{.*}}[[a:%.*]], <16 x i8>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <8 x i16> [[a]] to <16 x i8>
+  // LLVM: [[VPADAL_I:%.*]] = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> [[b]])
+  // LLVM: [[TMP1:%.*]] = add <8 x i16> [[VPADAL_I]], [[a]]
+  // LLVM: ret <8 x i16> [[TMP1]]
+}
+
+uint32x4_t test_vpadalq_u16(uint32x4_t a, uint16x8_t b) {
+  return vpadalq_u16(a, b);
+
+  // CIR-LABEL: vpadalq_u16
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u16i x 8>) -> !cir.vector<!u32i x 4>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!u32i x 4>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u32i x 4>
+
+  // LLVM: {{.*}}test_vpadalq_u16(<4 x i32>{{.*}}[[a:%.*]], <8 x i16>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <4 x i32> [[a]] to <16 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <8 x i16> [[b]] to <16 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <4 x i32> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <4 x i32> [[TMP2]]
+}
+
+uint64x2_t test_vpadalq_u32(uint64x2_t a, uint32x4_t b) {
+  return vpadalq_u32(a, b);
+
+  // CIR-LABEL: vpadalq_u32
+  // CIR: [[VPADAL_I:%.*]] = cir.llvm.intrinsic "aarch64.neon.uaddlp" {{%.*}} :
+  // CIR-SAME: (!cir.vector<!u32i x 4>) -> !cir.vector<!u64i x 2>
+  // CIR: [[a:%.*]] = cir.cast(bitcast, {{%.*}} : !cir.vector<!s8i x 16>), !cir.vector<!u64i x 2>
+  // CIR: {{%.*}} = cir.binop(add, [[VPADAL_I]], [[a]]) : !cir.vector<!u64i x 2>
+
+  // LLVM: {{.*}}test_vpadalq_u32(<2 x i64>{{.*}}[[a:%.*]], <4 x i32>{{.*}}[[b:%.*]])
+  // LLVM: [[TMP0:%.*]] = bitcast <2 x i64> [[a]] to <16 x i8>
+  // LLVM: [[TMP1:%.*]] = bitcast <4 x i32> [[b]] to <16 x i8>
+  // LLVM: [[VPADAL1_I:%.*]] = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> [[b]])
+  // LLVM: [[TMP2:%.*]] = add <2 x i64> [[VPADAL1_I]], [[a]]
+  // LLVM: ret <2 x i64> [[TMP2]]
+}