Merge pull request #640 from wheremyfoodat/wheremyfoodat-patch-2

Force-inline SIMD index buffer functions
wheremyfoodat · Nov 20, 2024 · 43991b7 · 43991b7
2 parents 224ddac + bea7b00
commit 43991b7
Show file tree

Hide file tree

Showing 2 changed files with 10 additions and 5 deletions.
diff --git a/include/PICA/pica_simd.hpp b/include/PICA/pica_simd.hpp
@@ -3,6 +3,7 @@
 #include <limits>
 #include <utility>
 
+#include "compiler_builtins.hpp"
 #include "helpers.hpp"
 
 #if defined(_M_AMD64) || defined(__x86_64__)
@@ -43,7 +44,7 @@ namespace PICA::IndexBuffer {
 
 #ifdef PICA_SIMD_ARM64
 	template <bool useShortIndices>
-	std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
+	ALWAYS_INLINE std::pair<u16, u16> analyzeNEON(u8* indexBuffer, u32 vertexCount) {
 		// We process 16 bytes per iteration, which is 8 vertices if we're using u16 indices or 16 vertices if we're using u8 indices
 		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;
 
@@ -134,7 +135,7 @@ namespace PICA::IndexBuffer {
 
 #if defined(PICA_SIMD_X64) && (defined(__SSE4_1__) || defined(__AVX__))
 	template <bool useShortIndices>
-	std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
+	ALWAYS_INLINE std::pair<u16, u16> analyzeSSE4_1(u8* indexBuffer, u32 vertexCount) {
 		// We process 16 bytes per iteration, which is 8 vertices if we're using u16
 		// indices or 16 vertices if we're using u8 indices
 		constexpr u32 vertsPerLoop = (useShortIndices) ? 8 : 16;

diff --git a/src/core/PICA/draw_acceleration.cpp b/src/core/PICA/draw_acceleration.cpp
@@ -90,7 +90,11 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			const u32 size = (attribInfo >> 2) + 1;   // Total number of components
 
 			// Size of each component based on the attribute type
-			static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			[[maybe_unused]] static constexpr u32 sizePerComponent[4] = {1, 1, 2, 4};
+			// To avoid a multiplication, we shift left by log2 of the component size rather than multiplying by the values above
+			// So multiplication by 1 becomes a shift by 0, multiplication by 2 becomes a shift by 1, and multiplication by 4 becomes a shift by 2
+			static constexpr u32 sizeShiftPerComponent[4] = {0, 0, 1, 2};
+
 			const u32 inputReg = (inputAttrCfg >> (attributeIndex * 4)) & 0xf;
 			// Mark the attribute as enabled
 			accel.enabledAttributeMask |= 1 << inputReg;
@@ -100,7 +104,7 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 			attr.offset = attributeOffset + loaderOffset;
 			attr.stride = loaderData.size;
 			attr.type = attribType;
-			attributeOffset += size * sizePerComponent[attribType];
+			attributeOffset += size << sizeShiftPerComponent[attribType];
 		}
 
 		loaderOffset += loader.size;
@@ -134,4 +138,4 @@ void GPU::getAcceleratedDrawInfo(PICA::DrawAcceleration& accel, bool indexed) {
 	}
 
 	accel.canBeAccelerated = true;
-}
+}