bytecodealliance · sunfishcode · Jul 17, 2019 · Jul 11, 2019 · Jul 11, 2019 · Jul 11, 2019
diff --git a/cranelift-codegen/meta/src/cdsl/instructions.rs b/cranelift-codegen/meta/src/cdsl/instructions.rs
@@ -12,7 +12,7 @@ use crate::cdsl::formats::{
 };
 use crate::cdsl::operands::Operand;
 use crate::cdsl::type_inference::Constraint;
-use crate::cdsl::types::{LaneType, ValueType};
+use crate::cdsl::types::{LaneType, ValueType, VectorType};
 use crate::cdsl::typevar::TypeVar;
 
 #[derive(Copy, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
@@ -176,6 +176,11 @@ impl Instruction {
     pub fn bind(&self, lane_type: impl Into<LaneType>) -> BoundInstruction {
         bind(self.clone(), Some(lane_type.into()), Vec::new())
     }
+
+    pub fn bind_vector(&self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
+        bind_vector(self.clone(), lane_type.into(), num_lanes, Vec::new())
+    }
+
     pub fn bind_any(&self) -> BoundInstruction {
         bind(self.clone(), None, Vec::new())
     }
@@ -400,6 +405,11 @@ impl BoundInstruction {
     pub fn bind(self, lane_type: impl Into<LaneType>) -> BoundInstruction {
         bind(self.inst, Some(lane_type.into()), self.value_types)
     }
+
+    pub fn bind_vector(self, lane_type: impl Into<LaneType>, num_lanes: u64) -> BoundInstruction {
+        bind_vector(self.inst, lane_type.into(), num_lanes, self.value_types)
+    }
+
     pub fn bind_any(self) -> BoundInstruction {
         bind(self.inst, None, self.value_types)
     }
@@ -1062,6 +1072,26 @@ fn bind(
         }
     }
 
+    verify_polymorphic_binding(&inst, &value_types);
+
+    BoundInstruction { inst, value_types }
+}
+
+/// Helper bind for vector types reused by {Bound,}Instruction::bind_vector.
+fn bind_vector(
+    inst: Instruction,
+    lane_type: LaneType,
+    num_lanes: u64,
+    mut value_types: Vec<ValueTypeOrAny>,
+) -> BoundInstruction {
+    let vector_type = ValueType::Vector(VectorType::new(lane_type, num_lanes));
+    value_types.push(ValueTypeOrAny::ValueType(vector_type));
+    verify_polymorphic_binding(&inst, &value_types);
+    BoundInstruction { inst, value_types }
+}
+
+/// Helper to verify that binding types to the instruction does not violate polymorphic rules
+fn verify_polymorphic_binding(inst: &Instruction, value_types: &Vec<ValueTypeOrAny>) {
     match &inst.polymorphic_info {
         Some(poly) => {
             assert!(
@@ -1076,6 +1106,4 @@ fn bind(
             ));
         }
     }
-
-    BoundInstruction { inst, value_types }
 }
diff --git a/cranelift-codegen/meta/src/gen_legalizer.rs b/cranelift-codegen/meta/src/gen_legalizer.rs
@@ -51,7 +51,7 @@ fn unwrap_inst(
 
     fmtln!(
         fmt,
-        "let ({}, predicate) = if let ir::InstructionData::{} {{",
+        "let ({}, predicate) = if let crate::ir::InstructionData::{} {{",
         arg_names,
         iform.name
     );
@@ -135,18 +135,16 @@ fn unwrap_inst(
                     .to_comment_string(var_pool)
             ));
 
-            fmt.line("let results = pos.func.dfg.inst_results(inst);");
+            fmt.line("let r = pos.func.dfg.inst_results(inst);");
             for (i, &var_index) in def.defined_vars.iter().enumerate() {
                 let var = var_pool.get(var_index);
-                fmtln!(fmt, "let {} = &results[{}];", var.name, i);
-                if var.has_free_typevar() {
-                    fmtln!(
-                        fmt,
-                        "let typeof_{} = pos.func.dfg.value_type(*{});",
-                        var.name,
-                        var.name
-                    );
-                }
+                fmtln!(fmt, "let {} = &r[{}];", var.name, i);
+                fmtln!(
+                    fmt,
+                    "let typeof_{} = pos.func.dfg.value_type(*{});",
+                    var.name,
+                    var.name
+                );
             }
 
             replace_inst = true;
@@ -187,11 +185,15 @@ fn build_derived_expr(tv: &TypeVar) -> String {
         Some(base) => base,
         None => {
             assert!(tv.name.starts_with("typeof_"));
-            return format!("{}", tv.name);
+            return format!("Some({})", tv.name);
         }
     };
     let base_expr = build_derived_expr(&base.type_var);
-    format!("{}.{}()", base_expr, base.derived_func.name())
+    format!(
+        "{}.map(|t: crate::ir::Type| t.{}())",
+        base_expr,
+        base.derived_func.name()
+    )
 }
 
 /// Emit rust code for the given check.
@@ -221,18 +223,29 @@ fn emit_runtime_typecheck<'a, 'b>(
         Constraint::Eq(tv1, tv2) => {
             fmtln!(
                 fmt,
-                "let predicate = predicate && {} == {};",
+                "let predicate = predicate && match ({}, {}) {{",
                 build_derived_expr(tv1),
                 build_derived_expr(tv2)
             );
+            fmt.indent(|fmt| {
+                fmt.line("(Some(a), Some(b)) => a == b,");
+                fmt.comment("On overflow, constraint doesn\'t apply");
+                fmt.line("_ => false,");
+            });
+            fmtln!(fmt, "};");
         }
         Constraint::WiderOrEq(tv1, tv2) => {
             fmtln!(
                 fmt,
-                "let predicate = predicate && {}.wider_or_equal({});",
+                "let predicate = predicate && match ({}, {}) {{",
                 build_derived_expr(tv1),
                 build_derived_expr(tv2)
             );
+            fmt.indent(|fmt| {
+                fmt.line("(Some(a), Some(b)) => a.wider_or_equal(b),");
+                fmt.comment("On overflow, constraint doesn\'t apply");
+                fmt.line("_ => false,");
+            });
             fmtln!(fmt, "};");
         }
     }
@@ -290,7 +303,8 @@ fn emit_dst_inst(def: &Def, def_pool: &DefPool, var_pool: &VarPool, fmt: &mut Fo
             // Unwrapping would have left the results intact.  Replace the whole instruction.
             fmtln!(
                 fmt,
-                "pos.func.dfg.replace(inst).{};",
+                "let {} = pos.func.dfg.replace(inst).{};",
+                defined_vars,
                 def.apply.rust_builder(&def.defined_vars, var_pool)
             );
 
@@ -406,16 +420,17 @@ fn gen_transform_group<'a>(
     // Function arguments.
     fmtln!(fmt, "pub fn {}(", group.name);
     fmt.indent(|fmt| {
-        fmt.line("inst: ir::Inst,");
-        fmt.line("func: &mut ir::Function,");
-        fmt.line("cfg: &mut ControlFlowGraph,");
-        fmt.line("isa: &dyn TargetIsa,");
+        fmt.line("inst: crate::ir::Inst,");
+        fmt.line("func: &mut crate::ir::Function,");
+        fmt.line("cfg: &mut crate::flowgraph::ControlFlowGraph,");
+        fmt.line("isa: &dyn crate::isa::TargetIsa,");
     });
     fmtln!(fmt, ") -> bool {");
 
     // Function body.
     fmt.indent(|fmt| {
-        fmt.line("use ir::InstBuilder;");
+        fmt.line("use crate::ir::InstBuilder;");
+        fmt.line("use crate::cursor::{Cursor, FuncCursor};");
         fmt.line("let mut pos = FuncCursor::new(func).at_inst(inst);");
         fmt.line("pos.use_srcloc(inst);");
 

diff --git a/cranelift-codegen/meta/src/isa/x86/encodings.rs b/cranelift-codegen/meta/src/isa/x86/encodings.rs
@@ -9,8 +9,8 @@ use crate::cdsl::instructions::{
 };
 use crate::cdsl::recipes::{EncodingRecipe, EncodingRecipeNumber, Recipes};
 use crate::cdsl::settings::{SettingGroup, SettingPredicateNumber};
-
-use crate::shared::types::Bool::B1;
+use crate::cdsl::types::ValueType;
+use crate::shared::types::Bool::{B1, B16, B32, B64, B8};
 use crate::shared::types::Float::{F32, F64};
 use crate::shared::types::Int::{I16, I32, I64, I8};
 use crate::shared::Definitions as SharedDefinitions;
@@ -250,6 +250,17 @@ impl PerCpuModeEncodings {
             self.enc64(inst.clone().bind(I64).bind_any(), template);
         }
     }
+
+    /// Add the same encoding to both X86_32 and X86_64; assumes configuration (e.g. REX, operand binding) has already happened
+    fn enc_32_64_isap(
+        &mut self,
+        inst: BoundInstruction,
+        template: Template,
+        isap: SettingPredicateNumber,
+    ) {
+        self.enc32_isap(inst.clone(), template.clone(), isap);
+        self.enc64_isap(inst, template, isap);
+    }
 }
 
 // Definitions.
@@ -315,6 +326,7 @@ pub fn define(
     let ifcmp_sp = shared.by_name("ifcmp_sp");
     let imul = shared.by_name("imul");
     let indirect_jump_table_br = shared.by_name("indirect_jump_table_br");
+    let insertlane = shared.by_name("insertlane");
     let ireduce = shared.by_name("ireduce");
     let ishl = shared.by_name("ishl");
     let ishl_imm = shared.by_name("ishl_imm");
@@ -332,6 +344,7 @@ pub fn define(
     let load_complex = shared.by_name("load_complex");
     let nearest = shared.by_name("nearest");
     let popcnt = shared.by_name("popcnt");
+    let raw_bitcast = shared.by_name("raw_bitcast");
     let regfill = shared.by_name("regfill");
     let regmove = shared.by_name("regmove");
     let regspill = shared.by_name("regspill");
@@ -340,6 +353,7 @@ pub fn define(
     let rotl_imm = shared.by_name("rotl_imm");
     let rotr = shared.by_name("rotr");
     let rotr_imm = shared.by_name("rotr_imm");
+    let scalar_to_vector = shared.by_name("scalar_to_vector");
     let selectif = shared.by_name("selectif");
     let sextend = shared.by_name("sextend");
     let sload16 = shared.by_name("sload16");
@@ -377,6 +391,8 @@ pub fn define(
     let x86_fmax = x86.by_name("x86_fmax");
     let x86_fmin = x86.by_name("x86_fmin");
     let x86_pop = x86.by_name("x86_pop");
+    let x86_pshufd = x86.by_name("x86_pshufd");
+    let x86_pshufb = x86.by_name("x86_pshufb");
     let x86_push = x86.by_name("x86_push");
     let x86_sdivmodx = x86.by_name("x86_sdivmodx");
     let x86_smulx = x86.by_name("x86_smulx");
@@ -450,6 +466,7 @@ pub fn define(
     let rec_ldWithIndexDisp8 = r.template("ldWithIndexDisp8");
     let rec_mulx = r.template("mulx");
     let rec_null = r.recipe("null");
+    let rec_null_fpr = r.recipe("null_fpr");
     let rec_pcrel_fnaddr8 = r.template("pcrel_fnaddr8");
     let rec_pcrel_gvaddr8 = r.template("pcrel_gvaddr8");
     let rec_popq = r.template("popq");
@@ -459,6 +476,8 @@ pub fn define(
     let rec_pushq = r.template("pushq");
     let rec_ret = r.template("ret");
     let rec_r_ib = r.template("r_ib");
+    let rec_r_ib_unsigned = r.template("r_ib_unsigned");
+    let rec_r_ib_unsigned_r = r.template("r_ib_unsigned_r");
     let rec_r_id = r.template("r_id");
     let rec_rcmp = r.template("rcmp");
     let rec_rcmp_ib = r.template("rcmp_ib");
@@ -515,6 +534,8 @@ pub fn define(
     let use_popcnt = settings.predicate_by_name("use_popcnt");
     let use_lzcnt = settings.predicate_by_name("use_lzcnt");
     let use_bmi1 = settings.predicate_by_name("use_bmi1");
+    let use_sse2 = settings.predicate_by_name("use_sse2");
+    let use_ssse3 = settings.predicate_by_name("use_ssse3");
     let use_sse41 = settings.predicate_by_name("use_sse41");
 
     // Definitions.
@@ -603,8 +624,14 @@ pub fn define(
     // Finally, the 0xb8 opcode takes an 8-byte immediate with a REX.W prefix.
     e.enc64(iconst.bind(I64), rec_pu_iq.opcodes(vec![0xb8]).rex().w());
 
-    // Bool constants.
-    e.enc_both(bconst.bind(B1), rec_pu_id_bool.opcodes(vec![0xb8]));
+    // Bool constants (uses MOV)
+    for &ty in &[B1, B8, B16, B32] {
+        e.enc_both(bconst.bind(ty), rec_pu_id_bool.opcodes(vec![0xb8]));
+    }
+    e.enc64(
+        bconst.bind(B64),
+        rec_pu_id_bool.opcodes(vec![0xb8]).rex().w(),
+    );
 
     // Shifts and rotates.
     // Note that the dynamic shift amount is only masked by 5 or 6 bits; the 8-bit
@@ -1565,5 +1592,81 @@ pub fn define(
     e.enc_both(ffcmp.bind(F32), rec_fcmp.opcodes(vec![0x0f, 0x2e]));
     e.enc_both(ffcmp.bind(F64), rec_fcmp.opcodes(vec![0x66, 0x0f, 0x2e]));
 
+    // SIMD splat: before x86 can use vector data, it must be moved to XMM registers; see
+    // legalize.rs for how this is done; once there, x86_pshuf* (below) is used for broadcasting the
+    // value across the register
+
+    // PSHUFB, 8-bit shuffle using two XMM registers
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufb.bind_vector(ty, number_of_lanes);
+        let template = rec_fa.nonrex().opcodes(vec![0x66, 0x0f, 0x38, 0x00]);
+        e.enc32_isap(instruction.clone(), template.clone(), use_ssse3);
+        e.enc64_isap(instruction, template, use_ssse3);
+    }
+
+    // PSHUFD, 32-bit shuffle using one XMM register and a u8 immediate
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 32) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = x86_pshufd.bind_vector(ty, number_of_lanes);
+        let template = rec_r_ib_unsigned.nonrex().opcodes(vec![0x66, 0x0f, 0x70]);
+        e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
+        e.enc64_isap(instruction, template, use_sse2);
+    }
+
+    // SIMD scalar_to_vector; this uses MOV to copy the scalar value to an XMM register; according
+    // to the Intel manual: "When the destination operand is an XMM register, the source operand is
+    // written to the low doubleword of the register and the register is zero-extended to 128 bits."
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
+        let number_of_lanes = 128 / ty.lane_bits();
+        let instruction = scalar_to_vector.bind_vector(ty, number_of_lanes).bind(ty);
+        let template = rec_frurm.opcodes(vec![0x66, 0x0f, 0x6e]); // MOVD/MOVQ
+        if ty.lane_bits() < 64 {
+            // no 32-bit encodings for 64-bit widths
+            e.enc32_isap(instruction.clone(), template.clone(), use_sse2);
+        }
+        e.enc_x86_64_isap(instruction, template, use_sse2);
+    }
+
+    // SIMD insertlane
+    let mut insertlane_mapping: HashMap<u64, (Vec<u8>, SettingPredicateNumber)> = HashMap::new();
+    insertlane_mapping.insert(8, (vec![0x66, 0x0f, 0x3a, 0x20], use_sse41)); // PINSRB
+    insertlane_mapping.insert(16, (vec![0x66, 0x0f, 0xc4], use_sse2)); // PINSRW
+    insertlane_mapping.insert(32, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRD
+    insertlane_mapping.insert(64, (vec![0x66, 0x0f, 0x3a, 0x22], use_sse41)); // PINSRQ, only x86_64
+
+    for ty in ValueType::all_lane_types() {
+        if let Some((opcode, isap)) = insertlane_mapping.get(&ty.lane_bits()) {
+            let number_of_lanes = 128 / ty.lane_bits();
+            let instruction = insertlane.bind_vector(ty, number_of_lanes);
+            let template = rec_r_ib_unsigned_r.opcodes(opcode.clone());
+            if ty.lane_bits() < 64 {
+                e.enc_32_64_isap(instruction, template.nonrex(), isap.clone());
+            } else {
+                // turns out the 64-bit widths have REX/W encodings and are only available on x86_64
+                e.enc64_isap(instruction, template.rex().w(), isap.clone());
+            }
+        }
+    }
+
+    // SIMD bitcast f64 to all 8-bit-lane vectors (for legalizing splat.x8x16); assumes that f64 is stored in an XMM register
+    for ty in ValueType::all_lane_types().filter(|t| t.lane_bits() == 8) {
+        let instruction = bitcast.bind_vector(ty, 16).bind(F64);
+        e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
+        e.enc64_rec(instruction, rec_null_fpr, 0);
+    }
+
+    // SIMD bitcast all 128-bit vectors to each other (for legalizing splat.x16x8)
+    for from_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8) {
+        for to_type in ValueType::all_lane_types().filter(|t| t.lane_bits() >= 8 && *t != from_type)
+        {
+            let instruction = raw_bitcast
+                .bind_vector(to_type, 128 / to_type.lane_bits())
+                .bind_vector(from_type, 128 / from_type.lane_bits());
+            e.enc32_rec(instruction.clone(), rec_null_fpr, 0);
+            e.enc64_rec(instruction, rec_null_fpr, 0);
+        }
+    }
+
     e
 }