From 6367a26e62bc649b2e99c2a7e1a6d8c8771e6642 Mon Sep 17 00:00:00 2001
From: Wunkolo <Wunkolo@gmail.com>
Date: Sat, 18 Jun 2022 00:02:59 -0700
Subject: [PATCH] emit_x64_{vector_}floating_point: Add AVX512 implementation
 for `DenormalsAreZero`
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both the single- and double-precision forms, as well as the packed and
scalar versions of the `VFIXUPIMM*` instruction, will be able to use the
same memory constant. This takes advantage of the fact that `VFIXUPIMM*`
doesn't just copy from the source: it also converts the value to `0.0`
if the source is a denormal and the `MXCSR.DAZ` flag is set.

```
tsrc[31:0]←((src1[30:23] = 0) AND (MXCSR.DAZ =1)) ? 0.0 : src1[31:0]
...
CASE(token_response[3:0]) {
    ...
    0001: dest[31:0]←tsrc[31:0]; ; pass through src1 normal input value, denormal as zero
    ...
```
---
 .../backend/x64/emit_x64_floating_point.cpp   | 22 +++++++++++++++++++
 .../x64/emit_x64_vector_floating_point.cpp    | 21 ++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
index 1f067f28..a66920ad 100644
--- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp
@@ -16,6 +16,7 @@
 #include <mcl/mp/typelist/lower_to_tuple.hpp>
 #include <mcl/stdint.hpp>
 #include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
 
 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/block_of_code.h"
@@ -79,6 +80,27 @@ constexpr u64 f64_max_s64_lim = 0x43e0000000000000u;  // 2^63 as a double (actua
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, EmitContext& ctx, std::initializer_list<Xbyak::Xmm> to_daz) {
     if (ctx.FPCR().FZ()) {
+        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+            constexpr u32 denormal_to_zero = FixupLUT(
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src);
+            constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
+
+            const Xbyak::Xmm tmp = xmm16;
+            FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
+
+            for (const Xbyak::Xmm& xmm : to_daz) {
+                FCODE(vfixupimms)(xmm, xmm, tmp, u8(0));
+            }
+            return;
+        }
+
         for (const Xbyak::Xmm& xmm : to_daz) {
             code.movaps(xmm0, code.MConst(xword, fsize == 32 ? f32_non_sign_mask : f64_non_sign_mask));
             code.andps(xmm0, xmm);
diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
index 8ce25004..416f3e1d 100644
--- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
+++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp
@@ -18,6 +18,7 @@
 #include <mcl/mp/typelist/lower_to_tuple.hpp>
 #include <mcl/type_traits/function_info.hpp>
 #include <mcl/type_traits/integer_of_size.hpp>
+#include <xbyak/xbyak.h>
 
 #include "dynarmic/backend/x64/abi.h"
 #include "dynarmic/backend/x64/block_of_code.h"
@@ -223,6 +224,26 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm result) {
 template<size_t fsize>
 void DenormalsAreZero(BlockOfCode& code, FP::FPCR fpcr, std::initializer_list<Xbyak::Xmm> to_daz, Xbyak::Xmm tmp) {
     if (fpcr.FZ()) {
+        if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) {
+            constexpr u32 denormal_to_zero = FixupLUT(
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src,
+                FpFixup::Norm_Src);
+            constexpr u64 denormal_to_zero64 = mcl::bit::replicate_element<fsize, u64>(denormal_to_zero);
+
+            FCODE(vmovap)(tmp, code.MConst(xword, u64(denormal_to_zero64), u64(denormal_to_zero64)));
+
+            for (const Xbyak::Xmm& xmm : to_daz) {
+                FCODE(vfixupimmp)(xmm, xmm, tmp, u8(0));
+            }
+            return;
+        }
+
         if (fpcr.RMode() != FP::RoundingMode::TowardsMinusInfinity) {
             code.movaps(tmp, GetNegativeZeroVector<fsize>(code));
         } else {