diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 6aceceaa4..8f90c9b00 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -103,6 +103,7 @@ add_library(citra_common STATIC logging/text_formatter.cpp logging/text_formatter.h logging/types.h + math_util.cpp math_util.h memory_detect.cpp memory_detect.h diff --git a/src/common/math_util.cpp b/src/common/math_util.cpp new file mode 100644 index 000000000..f9e7b5a35 --- /dev/null +++ b/src/common/math_util.cpp @@ -0,0 +1,151 @@ +// Copyright Citra Emulator Project / Azahar Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include +#include "math_util.h" + +#if defined(CITRA_HAS_SSE42) +#include +#include +#endif + +#if defined(__aarch64__) || defined(__ARM_NEON) +#define CITRA_HAS_NEON +#include +#endif + +#if defined(_MSC_VER) +#define DISABLE_VECTORIZE __pragma(loop(no_vector)) +#elif defined(__clang__) +#define DISABLE_VECTORIZE _Pragma("clang loop vectorize(disable)") +#elif defined(__GNUC__) +#define DISABLE_VECTORIZE _Pragma("GCC novector") +#else +#define DISABLE_VECTORIZE +#endif + +namespace Common { +std::pair FindMinMax(const std::span& data) { + const size_t count = data.size(); + const u8* data_ptr = data.data(); + u8 final_min, final_max; +#if defined(CITRA_HAS_SSE42) || defined(CITRA_HAS_NEON) + u8 simd_min = 0xFF; + u8 simd_max = 0; + size_t i = 0; + constexpr size_t simd_line_count = 16; + constexpr size_t count_threshold = simd_line_count * 2; + if (count >= count_threshold) { +#if defined(CITRA_HAS_SSE42) + __m128i vmin = _mm_set1_epi8(static_cast(0xFF)); + __m128i vmax = _mm_setzero_si128(); + for (; i + simd_line_count <= count; i += simd_line_count) { + __m128i vals = _mm_loadu_si128(reinterpret_cast(data_ptr + i)); + vmin = _mm_min_epu8(vmin, vals); + vmax = _mm_max_epu8(vmax, vals); + } + alignas(16) u8 tmp[simd_line_count]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), vmin); + simd_min = *std::min_element(tmp, tmp + simd_line_count); + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), vmax); + simd_max = *std::max_element(tmp, tmp + simd_line_count); +#elif defined(CITRA_HAS_NEON) + uint8x16_t vmin = vdupq_n_u8(0xFF); + uint8x16_t vmax = vdupq_n_u8(0); + for (; i + simd_line_count <= count; i += simd_line_count) { + uint8x16_t vals = vld1q_u8(data_ptr + i); + vmin = vminq_u8(vmin, vals); + vmax = vmaxq_u8(vmax, vals); + } + alignas(16) uint8_t tmp[simd_line_count]; + vst1q_u8(tmp, vmin); + simd_min = *std::min_element(tmp, tmp + simd_line_count); + vst1q_u8(tmp, vmax); + simd_max = *std::max_element(tmp, tmp + simd_line_count); +#endif // CITRA_HAS_SSE42 + } + DISABLE_VECTORIZE + for (; i < count; ++i) { + const u8 val = data_ptr[i]; + simd_min = std::min(simd_min, val); + simd_max = std::max(simd_max, val); + } + + final_min = simd_min; + final_max = simd_max; + +#else + // Scalar fallback + for (size_t i = 0; i < count; ++i) { + const u8 val = data_ptr[i]; + final_min = std::min(final_min, val); + final_max = std::max(final_max, val); + } +#endif // CITRA_HAS_SSE42 || CITRA_HAS_NEON + + return {final_min, final_max}; +} + +std::pair FindMinMax(const std::span& data) { + const size_t count = data.size(); + const u16* data_ptr = data.data(); + u16 final_min, final_max; + +#if defined(CITRA_HAS_SSE42) || defined(CITRA_HAS_NEON) + u16 simd_min = 0xFFFF; + u16 simd_max = 0; + size_t i = 0; + constexpr size_t simd_line_count = 8; + constexpr size_t count_threshold = simd_line_count * 2; + if (count >= count_threshold) { +#if defined(CITRA_HAS_SSE42) + __m128i vmin = _mm_set1_epi16(static_cast(0xFFFF)); + __m128i vmax = _mm_setzero_si128(); + for (; i + simd_line_count <= count; i += simd_line_count) { + __m128i vals = _mm_loadu_si128(reinterpret_cast(data_ptr + i)); + vmin = _mm_min_epu16(vmin, vals); + vmax = _mm_max_epu16(vmax, vals); + } + alignas(16) u16 tmp[simd_line_count]; + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), vmin); + simd_min = *std::min_element(tmp, tmp + simd_line_count); + _mm_storeu_si128(reinterpret_cast<__m128i*>(tmp), vmax); + simd_max = *std::max_element(tmp, tmp + simd_line_count); +#elif defined(CITRA_HAS_NEON) + uint16x8_t vmin = vdupq_n_u16(static_cast(0xFFFF)); + uint16x8_t vmax = vdupq_n_u16(0); + for (; i + simd_line_count <= count; i += simd_line_count) { + uint16x8_t vals = vld1q_u16(data_ptr + i); + vmin = vminq_u16(vmin, vals); + vmax = vmaxq_u16(vmax, vals); + } + alignas(16) uint16_t tmp[simd_line_count]; + vst1q_u16(tmp, vmin); + simd_min = *std::min_element(tmp, tmp + simd_line_count); + vst1q_u16(tmp, vmax); + simd_max = *std::max_element(tmp, tmp + simd_line_count); +#endif // CITRA_HAS_SSE42 + } + DISABLE_VECTORIZE + for (; i < count; ++i) { + const u16 val = data_ptr[i]; + simd_min = std::min(simd_min, val); + simd_max = std::max(simd_max, val); + } + + final_min = simd_min; + final_max = simd_max; + +#else + // Scalar fallback + for (u32 i = 0; i < count; ++i) { + const u16 val = data_ptr[i]; + final_min = std::min(final_min, val); + final_max = std::max(final_max, val); + } +#endif // CITRA_HAS_SSE42 || CITRA_HAS_NEON + + return {final_min, final_max}; +} +} // namespace Common diff --git a/src/common/math_util.h b/src/common/math_util.h index 7b5513788..9587fba93 100644 --- a/src/common/math_util.h +++ b/src/common/math_util.h @@ -9,7 +9,10 @@ #pragma once #include +#include #include +#include +#include "common_types.h" namespace Common { @@ -73,4 +76,7 @@ struct Rectangle { template Rectangle(T, T, T, T) -> Rectangle; +std::pair FindMinMax(const std::span& data); +std::pair FindMinMax(const std::span& data); + } // namespace Common diff --git a/src/video_core/rasterizer_accelerated.cpp b/src/video_core/rasterizer_accelerated.cpp index c6ebfdb70..d7f018405 100644 --- a/src/video_core/rasterizer_accelerated.cpp +++ b/src/video_core/rasterizer_accelerated.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include "common/alignment.h" +#include "common/math_util.h" #include "core/memory.h" #include "video_core/pica/pica_core.h" #include "video_core/rasterizer_accelerated.h" @@ -103,12 +104,19 @@ RasterizerAccelerated::VertexArrayInfo RasterizerAccelerated::AnalyzeVertexArray vertex_min = 0xFFFF; vertex_max = 0; - const u32 size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1); + const u32 count = regs.pipeline.num_vertices; + const u32 index_size = index_u16 ? 2 : 1; + const u32 size = count * index_size; FlushRegion(address, size); - for (u32 index = 0; index < regs.pipeline.num_vertices; ++index) { - const u32 vertex = index_u16 ? index_address_16[index] : index_address_8[index]; - vertex_min = std::min(vertex_min, vertex); - vertex_max = std::max(vertex_max, vertex); + + if (index_u16) { + const auto res = Common::FindMinMax({index_address_16, static_cast(count)}); + vertex_min = static_cast(res.first); + vertex_max = static_cast(res.second); + } else { + const auto res = Common::FindMinMax({index_address_8, static_cast(count)}); + vertex_min = static_cast(res.first); + vertex_max = static_cast(res.second); } } else { vertex_min = regs.pipeline.vertex_offset;