From ccaf2246970adb5eff43f442a2edb36b163c09b5 Mon Sep 17 00:00:00 2001 From: Ty Lamontagne Date: Mon, 23 Dec 2024 18:33:40 -0500 Subject: [PATCH] EE Cache: SIMD the EE cache lookup routine --- pcsx2/COP0.cpp | 34 ++++++++++++++++++-- pcsx2/R5900.cpp | 5 ++- pcsx2/R5900.h | 36 ++++++++++++++++++++- pcsx2/vtlb.cpp | 83 ++++++++++++++++++++++++++++++++----------------- 4 files changed, 123 insertions(+), 35 deletions(-) diff --git a/pcsx2/COP0.cpp b/pcsx2/COP0.cpp index f346f10117406..89a8a1de46212 100644 --- a/pcsx2/COP0.cpp +++ b/pcsx2/COP0.cpp @@ -280,6 +280,15 @@ void MapTLB(const tlbs& t, int i) } } +__inline int ConvertPageMask(const u32 PageMask) +{ + const u32 mask = std::popcount(PageMask >> 13); + + pxAssertMsg(!((mask & 1) || mask > 12), "Invalid page mask for this TLB entry. EE cache doesn't know what to do here."); + + return (1 << (12 + mask)) - 1; +} + void UnmapTLB(const tlbs& t, int i) { //Console.WriteLn("Clear TLB %d: %08x-> [%08x %08x] S=%d G=%d ASID=%d Mask= %03X", i,t.VPN2,t.PFN0,t.PFN1,t.S,t.G,t.ASID,t.Mask); @@ -324,7 +333,19 @@ void UnmapTLB(const tlbs& t, int i) } } - cachedTlbs.erase(std::remove(cachedTlbs.begin(), cachedTlbs.end(), &t), cachedTlbs.end()); + for (size_t i = 0; i < cachedTlbs.count; i++) + { + if (cachedTlbs.PFN0s[i] == t.PFN0() && cachedTlbs.PFN1s[i] == t.PFN1() && cachedTlbs.PageMasks[i] == ConvertPageMask(t.PageMask.UL)) + { + cachedTlbs.PFN0s.erase(cachedTlbs.PFN0s.begin() + i); + cachedTlbs.PFN1s.erase(cachedTlbs.PFN1s.begin() + i); + cachedTlbs.PageMasks.erase(cachedTlbs.PageMasks.begin() + i); + cachedTlbs.CacheEnabled0.erase(cachedTlbs.CacheEnabled0.begin() + i); + cachedTlbs.CacheEnabled1.erase(cachedTlbs.CacheEnabled1.begin() + i); + cachedTlbs.count--; + break; + } + } } void WriteTLB(int i) @@ -335,7 +356,16 @@ void WriteTLB(int i) tlb[i].EntryLo1.UL = cpuRegs.CP0.n.EntryLo1; if (!tlb[i].isSPR() && ((tlb[i].EntryLo0.V && tlb[i].EntryLo0.isCached()) || (tlb[i].EntryLo1.V && tlb[i].EntryLo1.isCached()))) - cachedTlbs.push_back(&tlb[i]); + { + const size_t idx = cachedTlbs.count; + cachedTlbs.CacheEnabled0[idx] = tlb[i].EntryLo0.isCached() ? ~0 : 0; + cachedTlbs.CacheEnabled1[idx] = tlb[i].EntryLo1.isCached() ? ~0 : 0; + cachedTlbs.PFN1s[idx] = tlb[i].PFN1(); + cachedTlbs.PFN0s[idx] = tlb[i].PFN0(); + cachedTlbs.PageMasks[idx] = ConvertPageMask(tlb[i].PageMask.UL); + + cachedTlbs.count++; + } MapTLB(tlb[i], i); } diff --git a/pcsx2/R5900.cpp b/pcsx2/R5900.cpp index 61dc588e70872..a4ff7bc222132 100644 --- a/pcsx2/R5900.cpp +++ b/pcsx2/R5900.cpp @@ -36,7 +36,7 @@ u32 EEoCycle; alignas(16) cpuRegistersPack _cpuRegistersPack; alignas(16) tlbs tlb[48]; -std::vector cachedTlbs; +cachedTlbs_t cachedTlbs; R5900cpu *Cpu = NULL; @@ -61,8 +61,7 @@ void cpuReset() std::memset(&cpuRegs, 0, sizeof(cpuRegs)); std::memset(&fpuRegs, 0, sizeof(fpuRegs)); std::memset(&tlb, 0, sizeof(tlb)); - cachedTlbs.clear(); - cachedTlbs.reserve(48); + cachedTlbs.reset(); cpuRegs.pc = 0xbfc00000; //set pc reg to stack cpuRegs.CP0.n.Config = 0x440; diff --git a/pcsx2/R5900.h b/pcsx2/R5900.h index bf7f156701fd4..66f8b00145a71 100644 --- a/pcsx2/R5900.h +++ b/pcsx2/R5900.h @@ -215,6 +215,14 @@ struct tlbs constexpr u32 Mask() const { return PageMask.Mask; } constexpr bool isGlobal() const { return EntryLo0.G && EntryLo1.G; } constexpr bool isSPR() const { return EntryLo0.S; } + + constexpr bool operator==(const tlbs& other) const + { + return PageMask.UL == other.PageMask.UL && + EntryHi.UL == other.EntryHi.UL && + EntryLo0.UL == other.EntryLo0.UL && + EntryLo1.UL == other.EntryLo1.UL; + } }; #ifndef _PC_ @@ -254,7 +262,33 @@ struct cpuRegistersPack alignas(16) extern cpuRegistersPack _cpuRegistersPack; alignas(16) extern tlbs tlb[48]; -extern std::vector cachedTlbs; + +struct cachedTlbs_t +{ + u32 count; + std::vector PageMasks; + std::vector PFN1s; + std::vector CacheEnabled1; + std::vector PFN0s; + std::vector CacheEnabled0; + + inline void reset() + { + count = 0; + PageMasks.clear(); + PageMasks.resize(48); + PFN1s.clear(); + PFN1s.resize(48); + PFN0s.clear(); + PFN0s.resize(48); + CacheEnabled1.clear(); + CacheEnabled1.resize(48); + CacheEnabled0.clear(); + CacheEnabled0.resize(48); + } +}; + +extern cachedTlbs_t cachedTlbs; static cpuRegisters& cpuRegs = _cpuRegistersPack.cpuRegs; static fpuRegisters& fpuRegs = _cpuRegistersPack.fpuRegs; diff --git a/pcsx2/vtlb.cpp b/pcsx2/vtlb.cpp index e4d8959ff7a7e..ee14fce04307d 100644 --- a/pcsx2/vtlb.cpp +++ b/pcsx2/vtlb.cpp @@ -30,6 +30,7 @@ #include "fmt/core.h" #include +#include #include #include #include @@ -109,47 +110,71 @@ vtlb_private::VTLBVirtual::VTLBVirtual(VTLBPhysical phys, u32 paddr, u32 vaddr) } } -__inline int ConvertPageMask(u32 PageMask) +__noinline int CheckCache(u32 addr) { - const u32 mask = std::popcount(PageMask >> 13); + // Check if the cache is enabled + if (((cpuRegs.CP0.n.Config >> 16) & 0x1) == 0) + { + return false; + } - pxAssertMsg(!((mask & 1) || mask > 12), "Invalid page mask for this TLB entry. EE cache doesn't know what to do here."); + const size_t size = cachedTlbs.count; + const int stride = 4; - return (1 << (12 + mask)) - 1; -} + __m128i addr_vec = _mm_set1_epi32(addr); -__inline int CheckCache(u32 addr) -{ - u32 mask; + size_t i = 0; - if (((cpuRegs.CP0.n.Config >> 16) & 0x1) == 0) + for (; i + stride <= size; i += stride) { - //DevCon.Warning("Data Cache Disabled! %x", cpuRegs.CP0.n.Config); - return false; // - } + __m128i pfn1_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PFN1s[i])); + __m128i pfn0_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PFN0s[i])); + __m128i mask_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.PageMasks[i])); - for (int i = 0; i < cachedTlbs.size(); i++) - { - const auto& entry = cachedTlbs[i]; - if (entry->EntryLo1.isCached()) + __m128i cached1_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.CacheEnabled1[i])); + __m128i cached0_vec = _mm_loadu_si128(reinterpret_cast(&cachedTlbs.CacheEnabled0[i])); + + __m128i pfn1_end_vec = _mm_add_epi32(pfn1_vec, mask_vec); + __m128i pfn0_end_vec = _mm_add_epi32(pfn0_vec, mask_vec); + + __m128i gteLowerBound0 = _mm_or_si128( + _mm_cmpgt_epi32(addr_vec, pfn0_vec), + _mm_cmpeq_epi32(addr_vec, pfn0_vec)); + __m128i gteUpperBound0 = _mm_or_si128( + _mm_cmpgt_epi32(pfn0_end_vec, addr_vec), + _mm_cmpeq_epi32(pfn0_end_vec, addr_vec)); + + __m128i gteUpperBound1 = _mm_or_si128( + _mm_cmpgt_epi32(pfn1_end_vec, addr_vec), + _mm_cmpeq_epi32(pfn1_end_vec, addr_vec)); + __m128i gteLowerBound1 = _mm_or_si128( + _mm_cmpgt_epi32(addr_vec, pfn1_vec), + _mm_cmpeq_epi32(addr_vec, pfn1_vec)); + + __m128i cmp0 = _mm_and_si128(gteLowerBound0, gteUpperBound0); + __m128i cmp1 = _mm_and_si128(gteLowerBound1, gteUpperBound1); + + cmp1 = _mm_and_si128(cmp1, cached1_vec); + cmp0 = _mm_and_si128(cmp0, cached0_vec); + + __m128i cmp = _mm_or_si128(cmp1, cmp0); + + if (!_mm_testz_si128(cmp, cmp)) { - mask = ConvertPageMask(entry->PageMask.UL); - if ((addr >= entry->PFN1()) && (addr <= entry->PFN1() + mask)) - { - //DevCon.Warning("Yay! Cache check cache addr=%x, mask=%x, addr+mask=%x, VPN2=%x PFN0=%x", addr, mask, (addr & mask), tlb[i].VPN2, tlb[i].PFN0); - return true; - } + return true; } - if (entry->EntryLo0.isCached()) + } + + for (; i < size; i++) + { + const u32 mask = cachedTlbs.PageMasks[i]; + if ((cachedTlbs.CacheEnabled1[i] && addr >= cachedTlbs.PFN1s[i] && addr <= cachedTlbs.PFN1s[i] + mask) || + (cachedTlbs.CacheEnabled0[i] && addr >= cachedTlbs.PFN0s[i] && addr <= cachedTlbs.PFN0s[i] + mask)) { - mask = ConvertPageMask(entry->PageMask.UL); - if ((addr >= entry->PFN0()) && (addr <= entry->PFN0() + mask)) - { - //DevCon.Warning("Yay! Cache check cache addr=%x, mask=%x, addr+mask=%x, VPN2=%x PFN0=%x", addr, mask, (addr & mask), tlb[i].VPN2, tlb[i].PFN0); - return true; - } + return true; } } + return false; } // --------------------------------------------------------------------------------------