From 05e97058c32fab0e0ebf88e7fff2a4b77890e206 Mon Sep 17 00:00:00 2001 From: MerryMage Date: Fri, 24 Mar 2017 15:56:24 +0000 Subject: [PATCH] parallel: Add and Subtract with Exchange improvements * Remove asx argument from PackedHalvingSubAdd{U16,S16} IR instruction * Implement Packed{Halving,}{AddSub,SubAdd}{U16,S16} IR instructions * Implement SASX, SSAX, UASX, USAX --- src/backend_x64/emit_x64.cpp | 90 ++++++++++++++----- src/frontend/ir/ir_emitter.cpp | 40 ++++++++- src/frontend/ir/ir_emitter.h | 10 ++- src/frontend/ir/opcodes.inc | 10 ++- .../translate/translate_arm/parallel.cpp | 48 +++++++--- 5 files changed, 157 insertions(+), 41 deletions(-) diff --git a/src/backend_x64/emit_x64.cpp b/src/backend_x64/emit_x64.cpp index 3cce95a6..5a3880e9 100644 --- a/src/backend_x64/emit_x64.cpp +++ b/src/backend_x64/emit_x64.cpp @@ -1986,17 +1986,15 @@ void EmitX64::EmitPackedHalvingSubS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* reg_alloc.DefineValue(inst, minuend); } -void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, bool is_signed) { +void EmitPackedSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst, bool hi_is_sum, bool is_signed, bool is_halving) { auto args = reg_alloc.GetArgumentInfo(inst); + auto ge_inst = inst->GetAssociatedPseudoOperation(IR::Opcode::GetGEFromOp); Xbyak::Reg32 reg_a_hi = reg_alloc.UseScratchGpr(args[0]).cvt32(); Xbyak::Reg32 reg_b_hi = reg_alloc.UseScratchGpr(args[1]).cvt32(); Xbyak::Reg32 reg_a_lo = reg_alloc.ScratchGpr().cvt32(); Xbyak::Reg32 reg_b_lo = reg_alloc.ScratchGpr().cvt32(); - - // If asx is true, the high word contains the sum and the low word the difference. - // If false, the high word contains the difference and the low word the sum. 
- bool asx = args[2].GetImmediateU1(); + Xbyak::Reg32 reg_sum, reg_diff; if (is_signed) { code->movsx(reg_a_lo, reg_a_hi.cvt16()); @@ -2010,22 +2008,48 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i code->shr(reg_b_hi, 16); } - if (asx) { - // Calculate diff such that reg_a_lo<31:16> contains diff<16:1>. + if (hi_is_sum) { code->sub(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate sum such that reg_a_hi<15:0> contains sum<16:1>. code->add(reg_a_hi, reg_b_lo); + reg_diff = reg_a_lo; + reg_sum = reg_a_hi; + } else { + code->add(reg_a_lo, reg_b_hi); + code->sub(reg_a_hi, reg_b_lo); + reg_diff = reg_a_hi; + reg_sum = reg_a_lo; + } + + if (ge_inst) { + EraseInstruction(block, ge_inst); + + // The reg_b registers are no longer required. + Xbyak::Reg32 ge_sum = reg_b_hi; + Xbyak::Reg32 ge_diff = reg_b_lo; + + code->mov(ge_sum, reg_sum); + code->mov(ge_diff, reg_diff); + + if (!is_signed) { + code->shl(ge_sum, 15); + code->sar(ge_sum, 16); + } else { + code->not_(ge_sum); + } + code->not_(ge_diff); + code->and_(ge_sum, hi_is_sum ? 0xC0000000 : 0x30000000); + code->and_(ge_diff, hi_is_sum ? 0x30000000 : 0xC0000000); + code->or_(ge_sum, ge_diff); + code->shr(ge_sum, 28); + + reg_alloc.DefineValue(ge_inst, ge_sum); + } + + if (is_halving) { + code->shl(reg_a_lo, 15); code->shr(reg_a_hi, 1); } else { - // Calculate sum such that reg_a_lo<31:16> contains sum<16:1>. - code->add(reg_a_lo, reg_b_hi); - code->shl(reg_a_lo, 15); - - // Calculate diff such that reg_a_hi<15:0> contains diff<16:1>. - code->sub(reg_a_hi, reg_b_lo); - code->shr(reg_a_hi, 1); + code->shl(reg_a_lo, 16); } // reg_a_lo now contains the low word and reg_a_hi now contains the high word. 
@@ -2035,12 +2059,36 @@ void EmitPackedHalvingSubAdd(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* i reg_alloc.DefineValue(inst, reg_a_hi); } -void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - EmitPackedHalvingSubAdd(code, reg_alloc, inst, false); +void EmitX64::EmitPackedAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, true, false, false); } -void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block&, IR::Inst* inst) { - EmitPackedHalvingSubAdd(code, reg_alloc, inst, true); +void EmitX64::EmitPackedAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, true, true, false); +} + +void EmitX64::EmitPackedSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, false, false, false); +} + +void EmitX64::EmitPackedSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, false, true, false); +} + +void EmitX64::EmitPackedHalvingAddSubU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, true, false, true); +} + +void EmitX64::EmitPackedHalvingAddSubS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, true, true, true); +} + +void EmitX64::EmitPackedHalvingSubAddU16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, false, false, true); +} + +void EmitX64::EmitPackedHalvingSubAddS16(RegAlloc& reg_alloc, IR::Block& block, IR::Inst* inst) { + EmitPackedSubAdd(code, reg_alloc, block, inst, false, true, true); } static void EmitPackedOperation(BlockOfCode* code, RegAlloc& reg_alloc, IR::Inst* inst, void (Xbyak::CodeGenerator::*fn)(const Xbyak::Mmx& mmx, const Xbyak::Operand&)) { diff --git 
a/src/frontend/ir/ir_emitter.cpp b/src/frontend/ir/ir_emitter.cpp index 2bd7d864..fa4caaf9 100644 --- a/src/frontend/ir/ir_emitter.cpp +++ b/src/frontend/ir/ir_emitter.cpp @@ -402,6 +402,30 @@ IREmitter::ResultAndGE IREmitter::PackedSubS16(const Value& a, const Value& b) { return {result, ge}; } +IREmitter::ResultAndGE IREmitter::PackedAddSubU16(const Value& a, const Value& b) { + auto result = Inst(Opcode::PackedAddSubU16, {a, b}); + auto ge = Inst(Opcode::GetGEFromOp, {result}); + return {result, ge}; +} + +IREmitter::ResultAndGE IREmitter::PackedAddSubS16(const Value& a, const Value& b) { + auto result = Inst(Opcode::PackedAddSubS16, {a, b}); + auto ge = Inst(Opcode::GetGEFromOp, {result}); + return {result, ge}; +} + +IREmitter::ResultAndGE IREmitter::PackedSubAddU16(const Value& a, const Value& b) { + auto result = Inst(Opcode::PackedSubAddU16, {a, b}); + auto ge = Inst(Opcode::GetGEFromOp, {result}); + return {result, ge}; +} + +IREmitter::ResultAndGE IREmitter::PackedSubAddS16(const Value& a, const Value& b) { + auto result = Inst(Opcode::PackedSubAddS16, {a, b}); + auto ge = Inst(Opcode::GetGEFromOp, {result}); + return {result, ge}; +} + Value IREmitter::PackedHalvingAddU8(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingAddU8, {a, b}); } @@ -434,12 +458,20 @@ Value IREmitter::PackedHalvingSubS16(const Value& a, const Value& b) { return Inst(Opcode::PackedHalvingSubS16, {a, b}); } -Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx) { - return Inst(Opcode::PackedHalvingSubAddU16, {a, b, Imm1(asx)}); +Value IREmitter::PackedHalvingAddSubU16(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingAddSubU16, {a, b}); } -Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx) { - return Inst(Opcode::PackedHalvingSubAddS16, {a, b, Imm1(asx)}); +Value IREmitter::PackedHalvingAddSubS16(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingAddSubS16, {a, b}); +} + 
+Value IREmitter::PackedHalvingSubAddU16(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingSubAddU16, {a, b}); +} + +Value IREmitter::PackedHalvingSubAddS16(const Value& a, const Value& b) { + return Inst(Opcode::PackedHalvingSubAddS16, {a, b}); } Value IREmitter::PackedSaturatedAddU8(const Value& a, const Value& b) { diff --git a/src/frontend/ir/ir_emitter.h b/src/frontend/ir/ir_emitter.h index e2f701f2..be341e0e 100644 --- a/src/frontend/ir/ir_emitter.h +++ b/src/frontend/ir/ir_emitter.h @@ -149,6 +149,10 @@ public: ResultAndGE PackedSubS8(const Value& a, const Value& b); ResultAndGE PackedSubU16(const Value& a, const Value& b); ResultAndGE PackedSubS16(const Value& a, const Value& b); + ResultAndGE PackedAddSubU16(const Value& a, const Value& b); + ResultAndGE PackedAddSubS16(const Value& a, const Value& b); + ResultAndGE PackedSubAddU16(const Value& a, const Value& b); + ResultAndGE PackedSubAddS16(const Value& a, const Value& b); Value PackedHalvingAddU8(const Value& a, const Value& b); Value PackedHalvingAddS8(const Value& a, const Value& b); Value PackedHalvingSubU8(const Value& a, const Value& b); @@ -157,8 +161,10 @@ public: Value PackedHalvingAddS16(const Value& a, const Value& b); Value PackedHalvingSubU16(const Value& a, const Value& b); Value PackedHalvingSubS16(const Value& a, const Value& b); - Value PackedHalvingSubAddU16(const Value& a, const Value& b, bool asx); - Value PackedHalvingSubAddS16(const Value& a, const Value& b, bool asx); + Value PackedHalvingAddSubU16(const Value& a, const Value& b); + Value PackedHalvingAddSubS16(const Value& a, const Value& b); + Value PackedHalvingSubAddU16(const Value& a, const Value& b); + Value PackedHalvingSubAddS16(const Value& a, const Value& b); Value PackedSaturatedAddU8(const Value& a, const Value& b); Value PackedSaturatedAddS8(const Value& a, const Value& b); Value PackedSaturatedSubU8(const Value& a, const Value& b); diff --git a/src/frontend/ir/opcodes.inc 
b/src/frontend/ir/opcodes.inc index d6339799..edf8cb6a 100644 --- a/src/frontend/ir/opcodes.inc +++ b/src/frontend/ir/opcodes.inc @@ -90,6 +90,10 @@ OPCODE(PackedAddU16, T::U32, T::U32, T::U32 OPCODE(PackedAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedSubU16, T::U32, T::U32, T::U32 ) OPCODE(PackedSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedSubAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingAddU8, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingAddS8, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubU8, T::U32, T::U32, T::U32 ) @@ -98,8 +102,10 @@ OPCODE(PackedHalvingAddU16, T::U32, T::U32, T::U32 OPCODE(PackedHalvingAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubU16, T::U32, T::U32, T::U32 ) OPCODE(PackedHalvingSubS16, T::U32, T::U32, T::U32 ) -OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32, T::U1 ) -OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32, T::U1 ) +OPCODE(PackedHalvingAddSubU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingAddSubS16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddU16, T::U32, T::U32, T::U32 ) +OPCODE(PackedHalvingSubAddS16, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedAddU8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedAddS8, T::U32, T::U32, T::U32 ) OPCODE(PackedSaturatedSubU8, T::U32, T::U32, T::U32 ) diff --git a/src/frontend/translate/translate_arm/parallel.cpp b/src/frontend/translate/translate_arm/parallel.cpp index 9debd6d0..df8ce4c4 100644 --- a/src/frontend/translate/translate_arm/parallel.cpp +++ b/src/frontend/translate/translate_arm/parallel.cpp @@ -33,13 +33,25 @@ bool ArmTranslatorVisitor::arm_SADD16(Cond cond, Reg n, Reg d, Reg m) { } bool ArmTranslatorVisitor::arm_SASX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return 
UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedAddSubS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + } + return true; } bool ArmTranslatorVisitor::arm_SSAX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedSubAddS16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + } + return true; } bool ArmTranslatorVisitor::arm_SSUB8(Cond cond, Reg n, Reg d, Reg m) { @@ -87,13 +99,25 @@ bool ArmTranslatorVisitor::arm_UADD16(Cond cond, Reg n, Reg d, Reg m) { } bool ArmTranslatorVisitor::arm_UASX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedAddSubU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + } + return true; } bool ArmTranslatorVisitor::arm_USAX(Cond cond, Reg n, Reg d, Reg m) { - UNUSED(cond, n, d, m); - return InterpretThisInstruction(); + if (d == Reg::PC || n == Reg::PC || m == Reg::PC) + return UnpredictableInstruction(); + if (ConditionPassed(cond)) { + auto result = ir.PackedSubAddU16(ir.GetRegister(n), ir.GetRegister(m)); + ir.SetRegister(d, result.result); + ir.SetGEFlags(result.ge); + } + return true; } bool ArmTranslatorVisitor::arm_USAD8(Cond cond, Reg d, Reg m, Reg n) { @@ -261,7 +285,7 @@ bool ArmTranslatorVisitor::arm_SHASX(Cond cond, Reg n, Reg d, Reg m) { if (d == Reg::PC || n == Reg::PC || m == Reg::PC) return UnpredictableInstruction(); if (ConditionPassed(cond)) { - auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), true); + auto result = 
ir.PackedHalvingAddSubS16(ir.GetRegister(n), ir.GetRegister(m)); ir.SetRegister(d, result); } return true; @@ -271,7 +295,7 @@ bool ArmTranslatorVisitor::arm_SHSAX(Cond cond, Reg n, Reg d, Reg m) { if (d == Reg::PC || n == Reg::PC || m == Reg::PC) return UnpredictableInstruction(); if (ConditionPassed(cond)) { - auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m), false); + auto result = ir.PackedHalvingSubAddS16(ir.GetRegister(n), ir.GetRegister(m)); ir.SetRegister(d, result); } return true; @@ -321,7 +345,7 @@ bool ArmTranslatorVisitor::arm_UHASX(Cond cond, Reg n, Reg d, Reg m) { if (d == Reg::PC || n == Reg::PC || m == Reg::PC) return UnpredictableInstruction(); if (ConditionPassed(cond)) { - auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), true); + auto result = ir.PackedHalvingAddSubU16(ir.GetRegister(n), ir.GetRegister(m)); ir.SetRegister(d, result); } return true; @@ -331,7 +355,7 @@ bool ArmTranslatorVisitor::arm_UHSAX(Cond cond, Reg n, Reg d, Reg m) { if (d == Reg::PC || n == Reg::PC || m == Reg::PC) return UnpredictableInstruction(); if (ConditionPassed(cond)) { - auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m), false); + auto result = ir.PackedHalvingSubAddU16(ir.GetRegister(n), ir.GetRegister(m)); ir.SetRegister(d, result); } return true;