From 4d78d167d6d0bdcf73e6f3f919f721b86ab441e2 Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Sat, 18 Jun 2022 01:12:36 -0700 Subject: [PATCH] emit_x64_{vector_}floating_point: Add AVX512 implementation for `ForceToDefaultNaN` `vfpclassp* k, xmm, i8` has better latency(4->3) and allocates better execution ports(01->5) that are out of the way of ALU-ports than `vcmpunordp* xmm, xmm, xmm`(`vcmpp* xmm, xmm, xmm, i8`) and removes the pipeline dependency on `xmm0` in favor of AVX512 `k`-mask registers. `vblendmp* xmm, k, xmm, mem` is about the same throughput and latency as `blendvp* xmm, mem` but has the benefit of embedded broadcasts to reduce memory bandwidth(32/64-bit read rather than 128-bit) and lends itself to a future size optimization feature of `constant_pool`. --- src/dynarmic/backend/x64/emit_x64_floating_point.cpp | 6 +++++- .../backend/x64/emit_x64_vector_floating_point.cpp | 9 +++++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp index a66920ad..d45e97ba 100644 --- a/src/dynarmic/backend/x64/emit_x64_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_floating_point.cpp @@ -136,7 +136,11 @@ void ZeroIfNaN(BlockOfCode& code, Xbyak::Xmm xmm_value, Xbyak::Xmm xmm_scratch) template void ForceToDefaultNaN(BlockOfCode& code, Xbyak::Xmm result) { - if (code.HasHostFeature(HostFeature::AVX)) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + const Xbyak::Opmask nan_mask = k1; + FCODE(vfpclasss)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN)); + FCODE(vblendmp)(result | nan_mask, result, code.MConst(ptr_b, fsize == 32 ? f32_nan : f64_nan)); + } else if (code.HasHostFeature(HostFeature::AVX)) { FCODE(vcmpunords)(xmm0, result, result); FCODE(blendvp)(result, code.MConst(xword, fsize == 32 ? 
f32_nan : f64_nan)); } else { diff --git a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp index 416f3e1d..c34ac3d6 100644 --- a/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp +++ b/src/dynarmic/backend/x64/emit_x64_vector_floating_point.cpp @@ -190,11 +190,16 @@ Xbyak::Address GetVectorOf(BlockOfCode& code) { template void ForceToDefaultNaN(BlockOfCode& code, FP::FPCR fpcr, Xbyak::Xmm result) { if (fpcr.DN()) { - const Xbyak::Xmm nan_mask = xmm0; - if (code.HasHostFeature(HostFeature::AVX)) { + if (code.HasHostFeature(HostFeature::AVX512_OrthoFloat)) { + const Xbyak::Opmask nan_mask = k1; + FCODE(vfpclassp)(nan_mask, result, u8(FpClass::QNaN | FpClass::SNaN)); + FCODE(vblendmp)(result | nan_mask, result, GetNaNVector(code)); + } else if (code.HasHostFeature(HostFeature::AVX)) { + const Xbyak::Xmm nan_mask = xmm0; FCODE(vcmpunordp)(nan_mask, result, result); FCODE(blendvp)(result, GetNaNVector(code)); } else { + const Xbyak::Xmm nan_mask = xmm0; code.movaps(nan_mask, result); FCODE(cmpordp)(nan_mask, nan_mask); code.andps(result, nan_mask);