From 6367a26e62bc649b2e99c2a7e1a6d8c8771e6642 Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sat, 18 Jun 2022 00:02:59 -0700 Subject: [PATCH] emit_x64_{vector_}floating_point: Add AVX512 implementation for `DenormalsAreZero` MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both single and double precision floating point numbers as well as the packed and unpacked version of this instruction will be able to use the same memory constant. This takes advantage of the fact that `VFIXUPIMM*` doesn't just copy from the source, but it will convert to `0.0` if it turns out that it is a denormal and the `MXCSR.DAZ` flag is set. ``` tsrc[31:0]←((src1[30:23] = 0) AND (MXCSR.DAZ =1)) ? 0.0 : src1[31:0] ... CASE(token_response[3:0]) { ... 0001: dest[31:0]←tsrc[31:0]; ; pass through src1 normal input value, denormal as zero ... ``` --- .../backend/x64/emit_x64_floating_point.cpp | 22 +++++++++++++++++++ .../x64/emit_x64_vector_floating_point.cpp | 21 ++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index 1f067f28..a66920ad 100644 --- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/block_of_code.h" @@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u; // 2^63 as a double (actua template void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list to_daz) { if (ctx.FPCR().FZ()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element(denormal_to_zero); + + const Xbyak::Xmm tmp = xmm16; + FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimms)(xmm, xmm, tmp, u8(0)); + } + return; + } + for (const Xbyak::Xmm& xmm : to_daz) { code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask)); code.andps(xmm0, xmm); diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 8ce25004..416f3e1d 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "dynarmic/backend/x64/abi.h" #include "dynarmic/backend/x64/block_of_code.h" @@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) { template void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list to_daz, Xbyak::Xmm tmp) { if (fpcr.FZ()) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + constexpr u32 denormal_to_zero = FixupLUT( + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src, + FpFixup::Norm_Src); + constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element(denormal_to_zero); + + FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64))); + + for (const Xbyak::Xmm& xmm : to_daz) { + FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0)); + } + return; + } + if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) { code.movaps(tmp, GetNegativeZeroVector(code)); } else {