emit_x64_vector_floating_point: Use deferred emits

This commit is contained in:
Merry 2022-07-13 10:52:06 +01:00 committed by merry
parent 7d5e078baa
commit 36f6114559
3 changed files with 61 additions and 59 deletions

View File

@@ -32,6 +32,8 @@ using namespace Xbyak::util;
EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block) EmitContext::EmitContext(RegAlloc& reg_alloc, IR::Block& block)
: reg_alloc(reg_alloc), block(block) {} : reg_alloc(reg_alloc), block(block) {}
EmitContext::~EmitContext() = default;
size_t EmitContext::GetInstOffset(IR::Inst* inst) const { size_t EmitContext::GetInstOffset(IR::Inst* inst) const {
return static_cast<size_t>(std::distance(block.begin(), IR::Block::iterator(inst))); return static_cast<size_t>(std::distance(block.begin(), IR::Block::iterator(inst)));
} }

View File

@@ -51,6 +51,7 @@ using HalfVectorArray = std::array<T, A64FullVectorWidth::value / mcl::bitsizeof
struct EmitContext { struct EmitContext {
EmitContext(RegAlloc& reg_alloc, IR::Block& block); EmitContext(RegAlloc& reg_alloc, IR::Block& block);
virtual ~EmitContext();
size_t GetInstOffset(IR::Inst* inst) const; size_t GetInstOffset(IR::Inst* inst) const;
void EraseInstruction(IR::Inst* inst); void EraseInstruction(IR::Inst* inst);

View File

@@ -112,14 +112,13 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
code.cmp(bitmask, 0); code.cmp(bitmask, 0);
} }
Xbyak::Label end; SharedLabel end = GenSharedLabel(), nan = GenSharedLabel();
Xbyak::Label nan;
code.jnz(nan, code.T_NEAR); code.jnz(*nan, code.T_NEAR);
code.L(end); code.L(*end);
code.SwitchToFarCode(); ctx.deferred_emits.emplace_back([=, &code, &ctx] {
code.L(nan); code.L(*nan);
const Xbyak::Xmm result = xmms[0]; const Xbyak::Xmm result = xmms[0];
@@ -127,7 +126,7 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
const size_t stack_space = xmms.size() * 16; const size_t stack_space = xmms.size() * 16;
code.sub(rsp, stack_space + ABI_SHADOW_SPACE); code.sub(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE));
for (size_t i = 0; i < xmms.size(); ++i) { for (size_t i = 0; i < xmms.size(); ++i) {
code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]); code.movaps(xword[rsp + ABI_SHADOW_SPACE + i * 16], xmms[i]);
} }
@@ -137,11 +136,11 @@ void HandleNaNs(BlockOfCode& code, EmitContext& ctx, bool fpcr_controlled, std::
code.CallFunction(nan_handler); code.CallFunction(nan_handler);
code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]); code.movaps(result, xword[rsp + ABI_SHADOW_SPACE + 0 * 16]);
code.add(rsp, stack_space + ABI_SHADOW_SPACE); code.add(rsp, static_cast<u32>(stack_space + ABI_SHADOW_SPACE));
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.add(rsp, 8); code.add(rsp, 8);
code.jmp(end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
code.SwitchToNearCode(); });
} }
template<size_t fsize> template<size_t fsize>
@@ -1117,7 +1116,7 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]); const Xbyak::Xmm xmm_c = ctx.reg_alloc.UseXmm(args[2]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
Xbyak::Label end, fallback; SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, xmm_a); code.movaps(result, xmm_a);
@@ -1127,19 +1126,19 @@ void EmitFPVectorMulAdd(BlockOfCode& code, EmitContext& ctx, IR::Inst* inst) {
code.andnps(tmp, result); code.andnps(tmp, result);
FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code)); FCODE(vcmpeq_uqp)(tmp, tmp, GetSmallestNormalVector<fsize>(code));
code.vptest(tmp, tmp); code.vptest(tmp, tmp);
code.jnz(fallback, code.T_NEAR); code.jnz(*fallback, code.T_NEAR);
code.L(end); code.L(*end);
}); });
code.SwitchToFarCode(); ctx.deferred_emits.emplace_back([=, &code, &ctx] {
code.L(fallback); code.L(*fallback);
code.sub(rsp, 8); code.sub(rsp, 8);
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn, fpcr_controlled); EmitFourOpFallbackWithoutRegAlloc(code, ctx, result, xmm_a, xmm_b, xmm_c, fallback_fn, fpcr_controlled);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.add(rsp, 8); code.add(rsp, 8);
code.jmp(end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
code.SwitchToNearCode(); });
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
return; return;
@@ -1377,7 +1376,7 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]); const Xbyak::Xmm operand2 = ctx.reg_alloc.UseXmm(args[1]);
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
Xbyak::Label end, fallback; SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code)); code.movaps(result, GetVectorOf<fsize, false, 0, 2>(code));
@@ -1385,19 +1384,19 @@ static void EmitRecipStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
FCODE(vcmpunordp)(tmp, result, result); FCODE(vcmpunordp)(tmp, result, result);
code.vptest(tmp, tmp); code.vptest(tmp, tmp);
code.jnz(fallback, code.T_NEAR); code.jnz(*fallback, code.T_NEAR);
code.L(end); code.L(*end);
}); });
code.SwitchToFarCode(); ctx.deferred_emits.emplace_back([=, &code, &ctx] {
code.L(fallback); code.L(*fallback);
code.sub(rsp, 8); code.sub(rsp, 8);
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled); EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.add(rsp, 8); code.add(rsp, 8);
code.jmp(end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
code.SwitchToNearCode(); });
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
return; return;
@@ -1591,7 +1590,7 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm tmp = ctx.reg_alloc.ScratchXmm();
const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm(); const Xbyak::Xmm mask = ctx.reg_alloc.ScratchXmm();
Xbyak::Label end, fallback; SharedLabel end = GenSharedLabel(), fallback = GenSharedLabel();
MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] { MaybeStandardFPSCRValue(code, ctx, fpcr_controlled, [&] {
code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code)); code.vmovaps(result, GetVectorOf<fsize, false, 0, 3>(code));
@@ -1602,21 +1601,21 @@ static void EmitRSqrtStepFused(BlockOfCode& code, EmitContext& ctx, IR::Inst* in
FCODE(vandp)(tmp, result, mask); FCODE(vandp)(tmp, result, mask);
ICODE(vpcmpeq)(tmp, tmp, mask); ICODE(vpcmpeq)(tmp, tmp, mask);
code.ptest(tmp, tmp); code.ptest(tmp, tmp);
code.jnz(fallback, code.T_NEAR); code.jnz(*fallback, code.T_NEAR);
FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code)); FCODE(vmulp)(result, result, GetVectorOf<fsize, false, -1, 1>(code));
code.L(end); code.L(*end);
}); });
code.SwitchToFarCode(); ctx.deferred_emits.emplace_back([=, &code, &ctx] {
code.L(fallback); code.L(*fallback);
code.sub(rsp, 8); code.sub(rsp, 8);
ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PushCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled); EmitThreeOpFallbackWithoutRegAlloc(code, ctx, result, operand1, operand2, fallback_fn, fpcr_controlled);
ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx())); ABI_PopCallerSaveRegistersAndAdjustStackExcept(code, HostLocXmmIdx(result.getIdx()));
code.add(rsp, 8); code.add(rsp, 8);
code.jmp(end, code.T_NEAR); code.jmp(*end, code.T_NEAR);
code.SwitchToNearCode(); });
ctx.reg_alloc.DefineValue(inst, result); ctx.reg_alloc.DefineValue(inst, result);
return; return;