diff --git a/.gitignore b/.gitignore index 27dca716..fce02145 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,9 @@ *.asm *.s -*.o -/obj +*.o +*.so +obj *.obj *.dll diff --git a/lto.c b/lto.c new file mode 100644 index 00000000..4765a53d --- /dev/null +++ b/lto.c @@ -0,0 +1,39 @@ +/******************************************************************************\ +* Project: Primitive LTO Merger Substitute * +* Authors: Iconoclast * +* Release: 2018.03.17 * +* License: CC0 Public Domain Dedication * +* * +* To the extent possible under law, the author(s) have dedicated all copyright * +* and related and neighboring rights to this software to the public domain * +* worldwide. This software is distributed without any warranty. * +* * +* You should have received a copy of the CC0 Public Domain Dedication along * +* with this software. * +* If not, see <http://creativecommons.org/publicdomain/zero/1.0/>. * +\******************************************************************************/ + +/* + * A single compile-and-link command will be sufficient with this method. + * + * A command exemplifying this on UNIX with all optimizations intact may be: + * $ cc --shared -o rsp.so lto.c -O3 -msse2 -DARCH_MIN_SSE2 -s + * + * To control the link-time stage during build with a separate command: + * $ gcc -c -o rsp.o lto.c -O3 -msse2 -DARCH_MIN_SSE2 + * $ ld --shared -o rsp.so -lc rsp.o --strip-all + */ + +#include "module.c" +#include "su.c" + +#include "vu/vu.c" + +#include "vu/multiply.c" +#include "vu/add.c" +#include "vu/select.c" +#include "vu/logical.c" +#include "vu/divide.c" +#if 0 +#include "vu/pack.c" +#endif diff --git a/make.sh b/make.sh index 77ff4a78..10040dc1 100755 --- a/make.sh +++ b/make.sh @@ -1,8 +1,6 @@ mkdir -p obj mkdir -p obj/vu -# The below path configuration will only work if you have this `make.sh` script -# installed to the parent directory just outside the RSP source when you run it. src="."
# or an absolute path, like "/home/user/rsp" obj="$src/obj" @@ -16,18 +14,10 @@ OBJ_LIST="\ $obj/vu/logical.o \ $obj/vu/divide.o" -FLAGS_ANSI="\ - -O3 \ - -fPIC \ - -DPLUGIN_API_VERSION=0x0101 \ - -march=native \ - -mstackrealign \ - -Wall \ - -pedanticz" +FLAGS_ANSI="-fPIC -DPLUGIN_API_VERSION=0x0101 -mstackrealign -Wall -pedantic" if [ `uname -m` == 'x86_64' ]; then FLAGS_x86="\ - -O3 \ -masm=intel \ -fPIC \ -DPLUGIN_API_VERSION=0x0101 \ @@ -38,11 +28,10 @@ FLAGS_x86="\ -pedantic \ -Wall -Wshadow -Wredundant-decls -Wextra -Wcast-align -Wcast-qual \ -Wdisabled-optimization -Wformat=2 -Winit-self -Wlogical-op - -Wmissing-include-dirs -Wstrict-overflow=5 -Wundef -Wno-unused \ + -Wmissing-include-dirs -Wstrict-overflow=1 -Wundef -Wno-unused \ -Wno-variadic-macros -Wno-parentheses -fdiagnostics-show-option" else FLAGS_x86="\ - -O3 \ -masm=intel \ -DPLUGIN_API_VERSION=0x0101 \ -DARCH_MIN_SSE2 \ @@ -52,25 +41,25 @@ FLAGS_x86="\ -pedantic \ -Wall -Wshadow -Wredundant-decls -Wextra -Wcast-align -Wcast-qual \ -Wdisabled-optimization -Wformat=2 -Winit-self -Wlogical-op - -Wmissing-include-dirs -Wstrict-overflow=5 -Wundef -Wno-unused \ + -Wmissing-include-dirs -Wstrict-overflow=1 -Wundef -Wno-unused \ -Wno-variadic-macros -Wno-parentheses -fdiagnostics-show-option" fi C_FLAGS=$FLAGS_x86 # default since Intel SIMD was the most tested echo Compiling C source code... 
-cc -S $C_FLAGS -o $obj/module.s $src/module.c -cc -S $C_FLAGS -o $obj/su.s $src/su.c -cc -S $C_FLAGS -o $obj/vu/vu.s $src/vu/vu.c -cc -S $C_FLAGS -o $obj/vu/multiply.s $src/vu/multiply.c -cc -S $C_FLAGS -o $obj/vu/add.s $src/vu/add.c -cc -S $C_FLAGS -o $obj/vu/select.s $src/vu/select.c -cc -S $C_FLAGS -o $obj/vu/logical.s $src/vu/logical.c -cc -S $C_FLAGS -o $obj/vu/divide.s $src/vu/divide.c +cc -S -Os $C_FLAGS -o $obj/module.s $src/module.c +cc -S -O3 $C_FLAGS -o $obj/su.s $src/su.c +cc -S -O3 $C_FLAGS -o $obj/vu/vu.s $src/vu/vu.c +cc -S -O3 $C_FLAGS -o $obj/vu/multiply.s $src/vu/multiply.c +cc -S -O3 $C_FLAGS -o $obj/vu/add.s $src/vu/add.c +cc -S -O3 $C_FLAGS -o $obj/vu/select.s $src/vu/select.c +cc -S -O3 $C_FLAGS -o $obj/vu/logical.s $src/vu/logical.c +cc -S -O2 $C_FLAGS -o $obj/vu/divide.s $src/vu/divide.c echo Assembling compiled sources... -as --statistics -o $obj/module.o $obj/module.s -as --statistics -o $obj/su.o $obj/su.s -as --statistics -o $obj/vu/vu.o $obj/vu/vu.s +as -o $obj/module.o $obj/module.s +as -o $obj/su.o $obj/su.s +as -o $obj/vu/vu.o $obj/vu/vu.s as -o $obj/vu/multiply.o $obj/vu/multiply.s as -o $obj/vu/add.o $obj/vu/add.s as -o $obj/vu/select.o $obj/vu/select.s @@ -78,5 +67,5 @@ as -o $obj/vu/logical.o $obj/vu/logical.s as -o $obj/vu/divide.o $obj/vu/divide.s echo Linking assembled object files... -ld --shared -o $obj/rspdebug.so $OBJ_LIST -strip -o $obj/rsp.so $obj/rspdebug.so +ld --shared -o $obj/rspdebug.so -lc $OBJ_LIST +strip -o $obj/rsp.so $obj/rspdebug.so --strip-all diff --git a/make_w32.cmd b/make_w32.cmd index 463f2066..ab9b4c12 100644 --- a/make_w32.cmd +++ b/make_w32.cmd @@ -1,7 +1,18 @@ @ECHO OFF TITLE MinGW Compiler Suite Invocation +REM If you have MinGW on a different drive letter or installed at a custom path +REM (or just not yet installed at all), this build script may not work out of +REM the box for most Windows users. 
Alternatives include MinGW-w32 or trying +REM to execute the Unix shell script "make.sh" from Windows 10+ or Git Bash. + +REM The following line is the only one you should ever need to change. set MinGW=C:\MinGW + +set lib=%MinGW%\lib +set bin=%MinGW%\bin +set inc=%MinGW%\include + REM set rsp=%USERPROFILE%\rsp set rsp=%CD% set obj=%rsp%\obj @@ -16,20 +27,16 @@ set OBJ_LIST=^ %obj%\vu\logical.o ^ %obj%\vu\divide.o -set FLAGS_ANSI=-O3^ +set FLAGS_ANSI=-Wall -pedantic^ -DPLUGIN_API_VERSION=0x0101^ - -march=native^ -mstackrealign^ - -Wall^ - -pedantic -set FLAGS_x86=-O3^ + -march=native +set FLAGS_x86=-Wall -pedantic^ -masm=intel^ -DPLUGIN_API_VERSION=0x0101^ -DARCH_MIN_SSE2^ - -march=native^ -mstackrealign^ - -Wall^ - -pedantic + -march=native set C_FLAGS=%FLAGS_x86% if not exist obj ( @@ -37,31 +44,33 @@ mkdir obj cd obj mkdir vu ) -cd %MinGW%\bin +cd /D %bin% ECHO Compiling C source code... -cc -S %C_FLAGS% -o %obj%\module.asm %rsp%\module.c -cc -S %C_FLAGS% -o %obj%\su.asm %rsp%\su.c -cc -S %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c -cc -S %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c -cc -S %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c -cc -S %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c -cc -S %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c -cc -S %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c +@ECHO ON +gcc -Os -S %C_FLAGS% -o %obj%\module.asm %rsp%\module.c +gcc -O3 -S %C_FLAGS% -o %obj%\su.asm %rsp%\su.c +gcc -O3 -S %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c +gcc -O3 -S %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c +gcc -O3 -S %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c +gcc -O3 -S %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c +gcc -O3 -S %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c +gcc -O2 -S %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c +@ECHO OFF ECHO. ECHO Assembling compiled sources... 
-as --statistics -o %obj%\module.o %obj%\module.asm -as --statistics -o %obj%\su.o %obj%\su.asm -as --statistics -o %obj%\vu\vu.o %obj%\vu\vu.asm -as -o %obj%\vu\multiply.o %obj%\vu\multiply.asm -as -o %obj%\vu\add.o %obj%\vu\add.asm -as -o %obj%\vu\select.o %obj%\vu\select.asm -as -o %obj%\vu\logical.o %obj%\vu\logical.asm -as -o %obj%\vu\divide.o %obj%\vu\divide.asm +as -o %obj%\module.o %obj%\module.asm +as -o %obj%\su.o %obj%\su.asm +as -o %obj%\vu\vu.o %obj%\vu\vu.asm +as -o %obj%\vu\multiply.o %obj%\vu\multiply.asm +as -o %obj%\vu\add.o %obj%\vu\add.asm +as -o %obj%\vu\select.o %obj%\vu\select.asm +as -o %obj%\vu\logical.o %obj%\vu\logical.asm +as -o %obj%\vu\divide.o %obj%\vu\divide.asm ECHO. ECHO Linking assembled object files... -ld --shared -e _DllMain@12 -o %obj%\rspdebug.dll %OBJ_LIST% %MinGW%\lib\libkernel32.a -strip -o %obj%/rsp.dll %obj%/rspdebug.dll +ld --shared -e _DllMain@12 -o %obj%\rspdebug.dll -L %lib% %OBJ_LIST% -lmsvcrt +strip -o %obj%\rsp.dll %obj%\rspdebug.dll --strip-all PAUSE diff --git a/make_w64.cmd b/make_w64.cmd index 11ff4a03..45cf7e7d 100644 --- a/make_w64.cmd +++ b/make_w64.cmd @@ -1,8 +1,22 @@ @ECHO OFF TITLE MinGW Compiler Suite Invocation -set version=x86_64-5.1.0-win32-seh-rt_v4-rev0 -set MinGW="C:\Program Files\mingw-w64\%version%\mingw64" +REM If you have installed MinGW-w64 without using MSYS2 to obtain the package +REM (or just not yet installed at all), this build script may not work out of +REM the box for most Windows users. If you have Cygwin instead or whatever +REM else, be sure to adjust the path below, or execute "make.sh" in a Git shell. + +REM The following line is the only one you should ever need to change. +set mingw64=C:\msys64\mingw64 + +REM The following two variables are irrelevant, unless you set a 32-bit target. 
+set mingw32=%mingw64%\..\mingw32 +set lib=%mingw32%\i686-w64-mingw32\lib + +set lib64=%mingw64%\x86_64-w64-mingw32\lib +set bin=%mingw64%\bin +set inc=%lib64%\..\include + REM set rsp=%USERPROFILE%\rsp set rsp=%CD% set obj=%rsp%\obj @@ -15,21 +29,18 @@ set OBJ_LIST=^ %obj%\vu\add.o ^ %obj%\vu\select.o ^ %obj%\vu\logical.o ^ -%obj%\vu\divide.o ^ -%MinGW%\x86_64-w64-mingw32\lib\libkernel32.a +%obj%\vu\divide.o -set FLAGS_ANSI=-Wall^ +set FLAGS_ANSI=-Wall -pedantic^ -DPLUGIN_API_VERSION=0x0101^ - -march=native^ -mstackrealign^ - -pedantic -set FLAGS_x86=-Wall^ - -masm=intel^ + -march=native +set FLAGS_x86=-Wall -pedantic^ -DPLUGIN_API_VERSION=0x0101^ -DARCH_MIN_SSE2^ - -march=native^ + -masm=intel^ -mstackrealign^ - -pedantic + -march=native set C_FLAGS=%FLAGS_x86% if not exist obj ( @@ -37,31 +48,33 @@ mkdir obj cd obj mkdir vu ) -cd %MinGW%\bin +cd /D %bin% ECHO Compiling C source code... -%MinGW%\bin\gcc.exe -S -Os %C_FLAGS% -o %obj%\module.asm %rsp%\module.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\su.asm %rsp%\su.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c -%MinGW%\bin\gcc.exe -S -O3 %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c +@ECHO ON +gcc -S -Os %C_FLAGS% -o %obj%\module.asm %rsp%\module.c +gcc -S -O3 %C_FLAGS% -o %obj%\su.asm %rsp%\su.c +gcc -S -O3 %C_FLAGS% -o %obj%\vu\vu.asm %rsp%\vu\vu.c +gcc -S -O3 %C_FLAGS% -o %obj%\vu\multiply.asm %rsp%\vu\multiply.c +gcc -S -O3 %C_FLAGS% -o %obj%\vu\add.asm %rsp%\vu\add.c +gcc -S -O3 %C_FLAGS% -o %obj%\vu\select.asm %rsp%\vu\select.c +gcc -S -O3 %C_FLAGS% -o %obj%\vu\logical.asm %rsp%\vu\logical.c +gcc -S -O2 %C_FLAGS% -o %obj%\vu\divide.asm %rsp%\vu\divide.c +@ECHO 
OFF ECHO. ECHO Assembling compiled sources... -%MinGW%\bin\as.exe -o %obj%\module.o %obj%\module.asm -%MinGW%\bin\as.exe -o %obj%\su.o %obj%\su.asm -%MinGW%\bin\as.exe -o %obj%\vu\vu.o %obj%\vu\vu.asm -%MinGW%\bin\as.exe -o %obj%\vu\multiply.o %obj%\vu\multiply.asm -%MinGW%\bin\as.exe -o %obj%\vu\add.o %obj%\vu\add.asm -%MinGW%\bin\as.exe -o %obj%\vu\select.o %obj%\vu\select.asm -%MinGW%\bin\as.exe -o %obj%\vu\logical.o %obj%\vu\logical.asm -%MinGW%\bin\as.exe -o %obj%\vu\divide.o %obj%\vu\divide.asm +as -o %obj%\module.o %obj%\module.asm +as -o %obj%\su.o %obj%\su.asm +as -o %obj%\vu\vu.o %obj%\vu\vu.asm +as -o %obj%\vu\multiply.o %obj%\vu\multiply.asm +as -o %obj%\vu\add.o %obj%\vu\add.asm +as -o %obj%\vu\select.o %obj%\vu\select.asm +as -o %obj%\vu\logical.o %obj%\vu\logical.asm +as -o %obj%\vu\divide.o %obj%\vu\divide.asm ECHO. ECHO Linking assembled object files... -%MinGW%\bin\ld.exe --shared -e DllMain -o %obj%\rspdebug.dll %OBJ_LIST% -%MinGW%\bin\strip.exe -o %obj%/rsp.dll %obj%/rspdebug.dll +ld --shared -e DllMain -o %obj%\rspdebug.dll -L%lib64% %OBJ_LIST% -lmsvcrt +strip -o %obj%\rsp.dll %obj%\rspdebug.dll --strip-all PAUSE diff --git a/module.c b/module.c index 77d6eb33..5ac1ea5c 100644 --- a/module.c +++ b/module.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: Module Subsystem Interface to SP Interpreter Core * * Authors: Iconoclast * -* Release: 2016.11.05 * +* Release: 2018.03.21 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -28,6 +28,20 @@ #include "module.h" #include "su.h" +#include <signal.h> +#include <setjmp.h> + +static jmp_buf CPU_state; +static void seg_av_handler(int signal_code) +{ + longjmp(CPU_state, signal_code); +} +static void ISA_op_illegal(int signal_code) +{ + message("Plugin built for SIMD extensions this CPU does not support!"); + raise(signal_code); /* e.g., rsp.dll built with -mssse3; the CPU is SSE2.
*/ +} + RSP_INFO RSP_INFO_NAME; #define RSP_CXD4_VERSION 0x0101 @@ -60,7 +74,7 @@ ptr_CoreDoCommand CoreDoCommand = NULL; NOINLINE void update_conf(const char* source) { - memset(conf, 0, sizeof(conf)); + memset(conf, 0, 32); m64p_rom_header ROM_HEADER; CoreDoCommand(M64CMD_ROM_GET_HEADER, sizeof(ROM_HEADER), &ROM_HEADER); @@ -252,7 +266,7 @@ EXPORT void CALL DllAbout(p_void hParent) EXPORT void CALL DllConfig(p_void hParent) { - my_system("sp_cfgui"); + system("sp_cfgui"); update_conf(CFG_FILE); if (DMEM == IMEM || GET_RCP_REG(SP_PC_REG) % 4096 == 0x00000000) @@ -269,27 +283,28 @@ EXPORT void CALL DllConfig(p_void hParent) EXPORT unsigned int CALL DoRspCycles(unsigned int cycles) { + static char task_debug[] = "unknown task type: 0x????????"; + char* task_debug_type; OSTask_type task_type; register unsigned int i; - if (GET_RCP_REG(SP_STATUS_REG) & 0x00000003) - { + if (GET_RCP_REG(SP_STATUS_REG) & 0x00000003) { message("SP_STATUS_HALT"); return 0x00000000; } + task_debug_type = &task_debug[strlen("unknown task type: 0x")]; - task_type = 0x00000000 #ifdef USE_CLIENT_ENDIAN - | *((pi32)(DMEM + 0x000FC0U)) + memcpy(&task_type, DMEM + 0xFC0, 4); #else - | (u32)DMEM[0xFC0] << 24 - | (u32)DMEM[0xFC1] << 16 - | (u32)DMEM[0xFC2] << 8 - | (u32)DMEM[0xFC3] << 0 -#endif + task_type = 0x00000000 + | (u32)(DMEM[0xFC0 ^ 0] & 0xFFu) << 24 + | (u32)(DMEM[0xFC1 ^ 0] & 0xFFu) << 16 + | (u32)(DMEM[0xFC2 ^ 0] & 0xFFu) << 8 + | (u32)(DMEM[0xFC3 ^ 0] & 0xFFu) << 0 ; +#endif switch (task_type) { -#ifdef EXTERN_COMMAND_LIST_GBI case M_GFXTASK: if (CFG_HLE_GFX == 0) break; @@ -299,10 +314,17 @@ EXPORT unsigned int CALL DoRspCycles(unsigned int cycles) GET_RCP_REG(SP_STATUS_REG) |= SP_STATUS_SIG2 | SP_STATUS_BROKE | SP_STATUS_HALT ; +#if defined(M64P_PLUGIN_API) if (GET_RSP_INFO(ProcessDlistList) == NULL) { /* branch */ } else GET_RSP_INFO(ProcessDlistList)(); +#else + if (GET_RSP_INFO(ProcessDList) == NULL) + { /* branch */ } + else + GET_RSP_INFO(ProcessDList)(); +#endif if 
((GET_RCP_REG(SP_STATUS_REG) & SP_STATUS_INTR_BREAK) && (GET_RCP_REG(SP_STATUS_REG) & (SP_STATUS_SIG2 | SP_STATUS_BROKE | SP_STATUS_HALT))) { GET_RCP_REG(MI_INTR_REG) |= 0x00000001; @@ -310,16 +332,21 @@ EXPORT unsigned int CALL DoRspCycles(unsigned int cycles) } GET_RCP_REG(DPC_STATUS_REG) &= ~0x00000002ul; /* DPC_STATUS_FREEZE */ return 0; -#endif -#ifdef EXTERN_COMMAND_LIST_ABI case M_AUDTASK: if (CFG_HLE_AUD == 0) break; +#if defined(M64P_PLUGIN_API) if (GET_RSP_INFO(ProcessAlistList) == NULL) { /* branch */ } else GET_RSP_INFO(ProcessAlistList)(); +#else + if (GET_RSP_INFO(ProcessAList) == NULL) + { /* branch */ } + else + GET_RSP_INFO(ProcessAList)(); +#endif GET_RCP_REG(SP_STATUS_REG) |= SP_STATUS_SIG2 | SP_STATUS_BROKE | SP_STATUS_HALT @@ -329,7 +356,6 @@ EXPORT unsigned int CALL DoRspCycles(unsigned int cycles) GET_RSP_INFO(CheckInterrupts)(); } return 0; -#endif case M_VIDTASK: message("M_VIDTASK"); break; @@ -346,10 +372,15 @@ EXPORT unsigned int CALL DoRspCycles(unsigned int cycles) break; GET_RSP_INFO(ShowCFB)(); /* forced FB refresh in case gfx plugin skip */ break; + default: + if (task_type == 0x8BC43B5D) + break; /* CIC boot code sent to the RSP */ + sprintf(task_debug_type, "%08lX", (unsigned long)task_type); + message(task_debug); } #ifdef WAIT_FOR_CPU_HOST - for (i = 0; i < 32; i++) + for (i = 0; i < NUMBER_OF_SCALAR_REGISTERS; i++) MFC0_count[i] = 0; #endif run_task(); @@ -387,7 +418,7 @@ EXPORT void CALL GetDllInfo(PLUGIN_INFO *PluginInfo) { PluginInfo -> Version = PLUGIN_API_VERSION; PluginInfo -> Type = PLUGIN_TYPE_RSP; - my_strcpy(PluginInfo -> Name, "Static Interpreter"); + strcpy(PluginInfo -> Name, "Static Interpreter"); PluginInfo -> NormalMemory = 0; PluginInfo -> MemoryBswaped = USE_CLIENT_ENDIAN; return; @@ -406,6 +437,8 @@ void no_LLE(void) } EXPORT void CALL InitiateRSP(RSP_INFO Rsp_Info, pu32 CycleCount) { + int recovered_from_exception; + if (CycleCount != NULL) /* cycle-accuracy not doable with today's hosts */ *CycleCount = 0; 
update_conf(CFG_FILE); @@ -425,7 +458,7 @@ EXPORT void CALL InitiateRSP(RSP_INFO Rsp_Info, pu32 CycleCount) CR[0x5] = &GET_RCP_REG(SP_DMA_FULL_REG); CR[0x6] = &GET_RCP_REG(SP_DMA_BUSY_REG); CR[0x7] = &GET_RCP_REG(SP_SEMAPHORE_REG); - GET_RCP_REG(SP_PC_REG) = 0x04001000; + *(RSP_INFO_NAME.SP_PC_REG) = 0x04001000; CR[0x8] = &GET_RCP_REG(DPC_START_REG); CR[0x9] = &GET_RCP_REG(DPC_END_REG); CR[0xA] = &GET_RCP_REG(DPC_CURRENT_REG); @@ -443,6 +476,28 @@ EXPORT void CALL InitiateRSP(RSP_INFO Rsp_Info, pu32 CycleCount) GBI_phase = GET_RSP_INFO(ProcessRdpList); if (GBI_phase == NULL) GBI_phase = no_LLE; + + signal(SIGILL, ISA_op_illegal); +#ifndef _WIN32 + signal(SIGSEGV, seg_av_handler); + for (SR[ra] = 0; SR[ra] < 0x80000000ul; SR[ra] += 0x200000) { + recovered_from_exception = setjmp(CPU_state); + if (recovered_from_exception) + break; + SR[at] += DRAM[SR[ra]]; + } + for (SR[at] = 0; SR[at] < 31; SR[at]++) { + SR[ra] = (SR[ra] & ~1) >> 1; + if (SR[ra] == 0) + break; + } + su_max_address = (1 << SR[at]) - 1; +#endif + + if (su_max_address < 0x1FFFFFul) + su_max_address = 0x1FFFFFul; /* 2 MiB */ + if (su_max_address > 0xFFFFFFul) + su_max_address = 0xFFFFFFul; /* 16 MiB */ return; } @@ -455,9 +510,9 @@ EXPORT void CALL RomClosed(void) * If the config file wasn't installed correctly, politely shut errors up. 
*/ #if !defined(M64P_PLUGIN_API) - FILE* stream = my_fopen(CFG_FILE, "wb"); - my_fwrite(conf, 8, 32 / 8, stream); - my_fclose(stream); + FILE* stream = fopen(CFG_FILE, "wb"); + fwrite(conf, 8, 32 / 8, stream); + fclose(stream); #endif return; } @@ -470,22 +525,22 @@ NOINLINE void message(const char* body) char* argv; int i, j; - argv = my_calloc(my_strlen(body) + 64, 1); - my_strcpy(argv, "CMD /Q /D /C \"TITLE RSP Message&&ECHO "); + argv = calloc(strlen(body) + 64, 1); + strcpy(argv, "CMD /Q /D /C \"TITLE RSP Message&&ECHO "); i = 0; - j = my_strlen(argv); + j = strlen(argv); while (body[i] != '\0') { if (body[i] == '\n') { - my_strcat(argv, "&&ECHO "); + strcat(argv, "&&ECHO "); ++i; j += 7; continue; } argv[j++] = body[i++]; } - my_strcat(argv, "&&PAUSE&&EXIT\""); - my_system(argv); - my_free(argv); + strcat(argv, "&&PAUSE&&EXIT\""); + system(argv); + free(argv); #else fputs(body, stdout); putchar('\n'); @@ -519,13 +574,13 @@ NOINLINE void update_conf(const char* source) for (i = 0; i < 32; i++) conf[i] = 0x00; - stream = my_fopen(source, "rb"); + stream = fopen(source, "rb"); if (stream == NULL) { message("Failed to read config."); return; } - my_fread(conf, 8, 32 / 8, stream); - my_fclose(stream); + fread(conf, 8, 32 / 8, stream); + fclose(stream); return; } #endif @@ -548,11 +603,11 @@ void step_SP_commands(uint32_t inst) sprintf(&offset[0], "%03X", GET_RCP_REG(SP_PC_REG) & 0xFFF); sprintf(&code[0], "%08X", inst); strcpy(text, offset); - my_strcat(text, "\n"); - my_strcat(text, code); + strcat(text, "\n"); + strcat(text, code); message(text); /* PC offset, MIPS hex. 
*/ if (output_log != NULL) - my_fwrite(endian_swap, 4, 1, output_log); + fwrite(endian_swap, 4, 1, output_log); } #endif @@ -563,13 +618,13 @@ NOINLINE void export_data_cache(void) register int i; /* const int little_endian = GET_RSP_INFO(MemoryBswaped); */ - DMEM_swapped = my_calloc(4096, 1); + DMEM_swapped = calloc(4096, 1); for (i = 0; i < 4096; i++) DMEM_swapped[i] = DMEM[BES(i)]; - out = my_fopen("rcpcache.dhex", "wb"); - my_fwrite(DMEM_swapped, 16, 4096 / 16, out); - my_fclose(out); - my_free(DMEM_swapped); + out = fopen("rcpcache.dhex", "wb"); + fwrite(DMEM_swapped, 16, 4096 / 16, out); + fclose(out); + free(DMEM_swapped); return; } NOINLINE void export_instruction_cache(void) @@ -579,13 +634,13 @@ NOINLINE void export_instruction_cache(void) register int i; /* const int little_endian = GET_RSP_INFO(MemoryBswaped); */ - IMEM_swapped = my_calloc(4096, 1); + IMEM_swapped = calloc(4096, 1); for (i = 0; i < 4096; i++) IMEM_swapped[i] = IMEM[BES(i)]; - out = my_fopen("rcpcache.ihex", "wb"); - my_fwrite(IMEM_swapped, 16, 4096 / 16, out); - my_fclose(out); - my_free(IMEM_swapped); + out = fopen("rcpcache.ihex", "wb"); + fwrite(IMEM_swapped, 16, 4096 / 16, out); + fclose(out); + free(IMEM_swapped); return; } void export_SP_memory(void) @@ -597,189 +652,26 @@ void export_SP_memory(void) /* * Microsoft linker defaults to an entry point of `_DllMainCRTStartup', - * which attaches several CRT dependencies. To eliminate CRT dependencies, - * we direct the linker to cursor the entry point to the lower-level - * `DllMain' symbol or, alternatively, link with /NOENTRY for no entry point. + * which attaches several CRT dependencies. To eliminate linkage of unused + * startup CRT code, we direct the linker to use DllMain as the entry point. + * + * The same approach is taken with MinGW to get those weird MinGW-specific + * messages and unused initializer functions out of the plugin binary. 
*/ -#ifdef WIN32 -BOOL WINAPI DllMain( - HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) +#ifdef _WIN32 +BOOL WINAPI +DllMain(HINSTANCE hModule, DWORD ul_reason_for_call, LPVOID lpReserved) { hModule = lpReserved = NULL; /* unused */ - switch (ul_reason_for_call) - { -case 1: /* DLL_PROCESS_ATTACH */ - break; -case 2: /* DLL_THREAD_ATTACH */ - break; -case 3: /* DLL_THREAD_DETACH */ - break; -case 0: /* DLL_PROCESS_DETACH */ + switch (ul_reason_for_call) { + case 1: /* DLL_PROCESS_ATTACH */ + case 2: /* DLL_THREAD_ATTACH */ + case 3: /* DLL_THREAD_DETACH */ + case 0: /* DLL_PROCESS_DETACH */ break; + default: + message("Unknown reason for call."); } - return 1; /* TRUE */ -} -#endif - -/* - * low-level recreations of the C standard library functions for operating - * systems that define a C run-time or dependency on top of fixed OS calls - * - * Currently, this only addresses Microsoft Windows. - * - * None of these are meant to out-perform the original functions, by the way - * (especially with better intrinsic compiler support for stuff like memcpy), - * just to cut down on I-cache use for performance-irrelevant code sections - * and to avoid std. lib run-time dependencies on certain operating systems. 
- */ - -NOINLINE p_void my_calloc(size_t count, size_t size) -{ -#ifdef WIN32 - return GlobalAlloc(GPTR, size * count); -#else - return calloc(count, size); -#endif -} - -NOINLINE void my_free(p_void ptr) -{ -#ifdef WIN32 - while (GlobalFree(ptr) != NULL) - message("GlobalFree() failure"); -#else - free(ptr); -#endif - return; -} - -NOINLINE size_t my_strlen(const char* str) -{ - size_t ret_slot; - - for (ret_slot = 0; *str != '\0'; ret_slot++, str++) - ; - return (ret_slot); -} - -NOINLINE char* my_strcpy(char* destination, const char* source) -{ - register size_t i; - const size_t length = my_strlen(source) + 1; /* including null terminator */ - - for (i = 0; i < length; i++) - destination[i] = source[i]; - return (destination); -} - -NOINLINE char* my_strcat(char* destination, const char* source) -{ - const size_t length = my_strlen(destination); - - my_strcpy(destination + length, source); - return (destination); -} - -NOINLINE int my_system(char* command) -{ - int ret_slot; -#ifdef WIN32 - static STARTUPINFOA info; - static PROCESS_INFORMATION info_process; - - info.cb = sizeof(info); - info.dwFillAttribute = - FOREGROUND_RED | FOREGROUND_GREEN | FOREGROUND_INTENSITY; - info.dwFlags = STARTF_USEFILLATTRIBUTE | STARTF_USECOUNTCHARS; - - info.dwXCountChars = 80; - info.dwYCountChars = 20; - - ret_slot = CreateProcessA( - NULL, - command, - NULL, - NULL, - FALSE, - 0x00000000, - NULL, - NULL, - &info, - &info_process - ); - - WaitForSingleObject(info_process.hProcess, INFINITE); - CloseHandle(info_process.hProcess); - CloseHandle(info_process.hThread); -#elif TARGET_OS_IPHONE || TARGET_OS_TV - // system not available in iOS - ret_slot = 0; -#else - ret_slot = system(command); -#endif - return (ret_slot); -} - -NOINLINE FILE* my_fopen(const char * filename, const char* mode) -{ -#ifdef WIN32 -#if 0 - if (mode[1] != 'b') - return NULL; /* non-binary yet to be supported? */ -#endif - return (FILE *)(HANDLE)CreateFileA( - filename, - (mode[0] == 'r') ? 
GENERIC_READ : GENERIC_WRITE, - (mode[0] == 'r') ? FILE_SHARE_READ : FILE_SHARE_WRITE, - NULL, - (mode[0] == 'r') ? OPEN_EXISTING : CREATE_ALWAYS, -#if 0 - FILE_FLAG_WRITE_THROUGH | FILE_FLAG_OVERLAPPED | FILE_FLAG_NO_BUFFERING, -#else - (mode[0] == 'r') ? FILE_ATTRIBUTE_NORMAL : FILE_FLAG_WRITE_THROUGH, -#endif - NULL - ); -#else - return fopen(filename, mode); -#endif -} - -NOINLINE int my_fclose(FILE* stream) -{ - int ret_slot; -#ifdef WIN32 - ret_slot = !CloseHandle((HANDLE)stream); -#else - ret_slot = fclose(stream); -#endif - return (ret_slot); -} - -NOINLINE size_t my_fread(p_void ptr, size_t size, size_t count, FILE* stream) -{ -#ifdef WIN32 - DWORD ret_slot; - - ReadFile((HANDLE)stream, ptr, size * count, &ret_slot, NULL); -#else - size_t ret_slot; - - ret_slot = fread(ptr, size, count, stream); -#endif - return (size_t)(ret_slot); -} - -NOINLINE size_t my_fwrite(p_void ptr, size_t size, size_t count, FILE* stream) -{ -#ifdef WIN32 - DWORD ret_slot; - - WriteFile((HANDLE)stream, ptr, size * count, &ret_slot, NULL); -#else - size_t ret_slot; - - ret_slot = fwrite(ptr, size, count, stream); -#endif - return (size_t)(ret_slot); + return TRUE; } +#endif \ No newline at end of file diff --git a/module.h b/module.h index 57711995..f889006b 100644 --- a/module.h +++ b/module.h @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: Module Subsystem Interface to SP Interpreter Core * * Authors: Iconoclast * -* Release: 2016.11.05 * +* Release: 2018.03.17 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -26,7 +26,9 @@ typedef enum { M_NJPEGTASK = 4, M_NULTASK = 5, M_HVQTASK = 6, - M_HVQMTASK = 7 + M_HVQMTASK = 7, + + NUM_KNOWN_TASK_TYPES } OSTask_type; #define CFG_FILE "rsp_conf.bin" @@ -87,21 +89,4 @@ extern void step_SP_commands(u32 inst); #endif extern void export_SP_memory(void); -/* - * low-level recreations of the C standard 
library functions for operating - * systems that provide an inconvenient C run-time ecosystem, like Windows - */ -NOINLINE extern p_void my_calloc(size_t count, size_t size); -NOINLINE extern void my_free(p_void ptr); -NOINLINE extern size_t my_strlen(const char* str); -NOINLINE extern char* my_strcpy(char* destination, const char* source); -NOINLINE extern char* my_strcat(char* destination, const char* source); -NOINLINE extern int my_system(char* command); -NOINLINE extern FILE* my_fopen(const char * filename, const char* mode); -NOINLINE extern int my_fclose(FILE* stream); -NOINLINE extern size_t my_fread( - p_void ptr, size_t size, size_t count, FILE* stream); -NOINLINE extern size_t my_fwrite( - p_void ptr, size_t size, size_t count, FILE* stream); - #endif diff --git a/su.c b/su.c index 97ef2618..7402c7bd 100644 --- a/su.c +++ b/su.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: MSP Simulation Layer for Scalar Unit Operations * * Authors: Iconoclast * -* Release: 2016.11.05 * +* Release: 2018.03.17 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -21,14 +21,18 @@ */ #include "module.h" +/* memcpy() and memset() in SP DMA */ +#include <string.h> + u32 inst_word; -u32 SR[32]; +u32 SR[NUMBER_OF_SCALAR_REGISTERS]; typedef VECTOR_OPERATION(*p_vector_func)(v16, v16); pu8 DRAM; pu8 DMEM; pu8 IMEM; +unsigned long su_max_address = 0x007FFFFFul; NOINLINE void res_S(void) { @@ -98,36 +102,41 @@ static void MT_SP_STATUS(unsigned int rt) pu32 SP_STATUS_REG; if (SR[rt] & 0xFE000040) - message("MTC0\nSP_STATUS"); - MI_INTR_REG = GET_RSP_INFO(MI_INTR_REG); + message("MTC0\nSP_STATUS"); /* bits we don't know what to do with */ SP_STATUS_REG = GET_RSP_INFO(SP_STATUS_REG); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000001) << 0); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00000002) << 0); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000004) << 1); - *MI_INTR_REG &=
~((SR[rt] & 0x00000008) >> 3); /* SP_CLR_INTR */ - *MI_INTR_REG |= ((SR[rt] & 0x00000010) >> 4); /* SP_SET_INTR */ - *SP_STATUS_REG |= (SR[rt] & 0x00000010) >> 4; /* int set halt */ + /* DMA_BUSY, DMA_FULL, IO_FULL: No feature exists to clear these. */ *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000020) << 5); - /* *SP_STATUS_REG |= (!!(SR[rt] & 0x00000040) << 5); */ *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000080) << 6); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00000100) << 6); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000200) << 7); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00000400) << 7); /* yield request? */ *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00000800) << 8); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00001000) << 8); /* yielded? */ *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00002000) << 9); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00004000) << 9); /* task done? */ *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00008000) << 10); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00010000) << 10); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00020000) << 11); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00040000) << 11); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00080000) << 12); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00100000) << 12); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00200000) << 13); - *SP_STATUS_REG |= (!!(SR[rt] & 0x00400000) << 13); *SP_STATUS_REG &= ~(!!(SR[rt] & 0x00800000) << 14); + + *SP_STATUS_REG |= (!!(SR[rt] & 0x00000002) << 0); + /* No feature exists to set BROKE: (!!1 << 1) */ + /* DMA_BUSY, DMA_FULL, IO_FULL: No feature exists to set these. */ + *SP_STATUS_REG |= (!!(SR[rt] & 0x00000040) << 5); + *SP_STATUS_REG |= (!!(SR[rt] & 0x00000100) << 6); + *SP_STATUS_REG |= (!!(SR[rt] & 0x00000400) << 7); /* yield request? */ + *SP_STATUS_REG |= (!!(SR[rt] & 0x00001000) << 8); /* yielded? */ + *SP_STATUS_REG |= (!!(SR[rt] & 0x00004000) << 9); /* task done? 
*/ + *SP_STATUS_REG |= (!!(SR[rt] & 0x00010000) << 10); + *SP_STATUS_REG |= (!!(SR[rt] & 0x00040000) << 11); + *SP_STATUS_REG |= (!!(SR[rt] & 0x00100000) << 12); + *SP_STATUS_REG |= (!!(SR[rt] & 0x00400000) << 13); *SP_STATUS_REG |= (!!(SR[rt] & 0x01000000) << 14); + + MI_INTR_REG = GET_RSP_INFO(MI_INTR_REG); + *MI_INTR_REG &= ~((SR[rt] & 0x00000008) >> 3); /* SP_CLR_INTR */ + *MI_INTR_REG |= ((SR[rt] & 0x00000010) >> 4); /* SP_SET_INTR */ + *SP_STATUS_REG |= (SR[rt] & 0x00000010) >> 4; /* int set halt */ return; } static void MT_SP_RESERVED(unsigned int rt) @@ -225,11 +234,12 @@ void SP_DMA_READ(void) do { offC = (count*length + *CR[0x0] + i) & 0x00001FF8ul; offD = (count*skip + *CR[0x1] + i) & 0x00FFFFF8ul; - *(pi64)(DMEM + offC) = - *(pi64)(DRAM + offD) - & (offD & ~MAX_DRAM_DMA_ADDR ? 0 : ~0) /* 0 if (addr > limit) */ - ; i += 0x008; + if (offD > su_max_address) { + memset(DMEM + offC, 0x00, 8); + continue; + } + memcpy(DMEM + offC, DRAM + offD, 8); } while (i < length); } while (count); @@ -264,8 +274,10 @@ void SP_DMA_WRITE(void) do { offC = (count*length + *CR[0x0] + i) & 0x00001FF8ul; offD = (count*skip + *CR[0x1] + i) & 0x00FFFFF8ul; - *(pi64)(DRAM + offD) = *(pi64)(DMEM + offC); i += 0x000008; + if (offD > su_max_address) + continue; + memcpy(DRAM + offD, DMEM + offC, 8); } while (i < length); } while (count); @@ -825,30 +837,12 @@ void SDV(unsigned vt, unsigned element, signed offset, unsigned base) return; } -static char transfer_debug[32] = "?WC2 $v00[0x0], 0x000($00)"; -static const char digits[16] = { - '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F' -}; - -NOINLINE void res_lsw( - unsigned vt, - unsigned element, - signed offset, - unsigned base) +NOINLINE void +res_lsw(unsigned vt, unsigned element, signed offset, unsigned base) { - transfer_debug[10] = '0' + (unsigned char)vt/10; - transfer_debug[11] = '0' + (unsigned char)vt%10; - - transfer_debug[15] = digits[element & 0xF]; - - transfer_debug[21] = digits[(offset & 0xFFF) >> 8]; 
- transfer_debug[22] = digits[(offset & 0x0FF) >> 4]; - transfer_debug[23] = digits[(offset & 0x00F) >> 0]; - - transfer_debug[26] = '0' + (unsigned char)base/10; - transfer_debug[27] = '0' + (unsigned char)base%10; - - message(transfer_debug); + message("Reserved vector unit transfer operation."); + if (vt != element + base || offset != 0) /* unused parameters */ + return; return; } @@ -1652,7 +1646,7 @@ void STV(unsigned vt, unsigned element, signed offset, unsigned base) int temp_PC; #ifdef WAIT_FOR_CPU_HOST -short MFC0_count[32]; +short MFC0_count[NUMBER_OF_SCALAR_REGISTERS]; #endif mwc2_func LWC2[2 * 8*2] = { @@ -1828,7 +1822,7 @@ PROFILE_MODE void MWC2_load(u32 inst) offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */ offset >>= 5 + 4; #else - offset = (inst & 64) ? -(s16)(~inst%64 + 1) : inst % 64; + offset = (inst & 64) ? -(s16)(~inst%64 + 1) : (s16)(inst % 64); #endif LWC2[IW_RD(inst)](vt, element, offset, base); } @@ -1844,7 +1838,7 @@ PROFILE_MODE void MWC2_store(u32 inst) offset <<= 5 + 4; /* safe on x86, skips 5-bit rd, 4-bit element */ offset >>= 5 + 4; #else - offset = (inst & 64) ? -(s16)(~inst%64 + 1) : inst % 64; + offset = (inst & 64) ? 
-(s16)(~inst%64 + 1) : (s16)(inst % 64); #endif SWC2[IW_RD(inst)](vt, element, offset, base); } diff --git a/su.h b/su.h index 0e46af16..f0f7d430 100644 --- a/su.h +++ b/su.h @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: Basic MIPS R4000 Instruction Set for Scalar Unit Operations * * Authors: Iconoclast * -* Release: 2016.11.05 * +* Release: 2018.03.17 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -22,8 +22,6 @@ #include "my_types.h" #include "rsp.h" -#define EXTERN_COMMAND_LIST_GBI -#define EXTERN_COMMAND_LIST_ABI #define SEMAPHORE_LOCK_CORRECTIONS #define WAIT_FOR_CPU_HOST @@ -34,10 +32,10 @@ /* * Currently, the plugin system this module is written for doesn't notify us - * of how much RDRAM is installed to the system, so we have to presume 8 MiB. + * of how much RDRAM is installed to the system, so we'll use signal handlers + * to catch memory segment access faults in the trial search to find it out. 
*/ -#define MAX_DRAM_ADDR 0x007FFFFFul -#define MAX_DRAM_DMA_ADDR (MAX_DRAM_ADDR & ~7) +extern unsigned long su_max_address; /* * Interact with memory using server-side byte order (MIPS big-endian) or @@ -79,7 +77,9 @@ typedef enum { zero = 0, + at = 1, + #ifdef TRUE_MIPS_AND_NOT_JUST_THE_RSP_SUBSET v0 = 2, v1 = 3, @@ -117,7 +117,9 @@ typedef enum { sp = 29, fp = 30, /* new, official MIPS name for it: "frame pointer" */ ra = 31, - S8 = fp + + NUMBER_OF_SCALAR_REGISTERS, + S8 = fp /* older name for GPR $fp as of the R4000 ISA */ } GPR_specifier; extern RSP_INFO RSP_INFO_NAME; @@ -125,7 +127,7 @@ extern pu8 DRAM; extern pu8 DMEM; extern pu8 IMEM; -extern u8 conf[32]; +extern u8 conf[]; /* * general-purpose scalar registers @@ -133,7 +135,7 @@ extern u8 conf[32]; * based on the MIPS instruction set architecture but without most of the * original register names (for example, no kernel-reserved registers) */ -extern u32 SR[32]; +extern u32 SR[]; #define FIT_IMEM(PC) ((PC) & 0xFFFu & 0xFFCu) @@ -155,7 +157,7 @@ int stage; extern int temp_PC; #ifdef WAIT_FOR_CPU_HOST -extern short MFC0_count[32]; +extern short MFC0_count[]; /* Keep one C0 MF status read count for each scalar register. 
*/ #endif @@ -266,8 +268,28 @@ extern void set_PC(unsigned int address); #define SP_STATUS_SIG6 (0x00000001ul << 13) #define SP_STATUS_SIG7 (0x00000001ul << 14) -#define NUMBER_OF_CP0_REGISTERS 16 -extern pu32 CR[NUMBER_OF_CP0_REGISTERS]; +enum { + RCP_SP_MEM_ADDR_REG, + RCP_SP_DRAM_ADDR_REG, + RCP_SP_RD_LEN_REG, + RCP_SP_WR_LEN_REG, + RCP_SP_STATUS_REG, + RCP_SP_DMA_FULL_REG, + RCP_SP_DMA_BUSY_REG, + RCP_SP_SEMAPHORE_REG, + + RCP_DPC_START_REG, + RCP_DPC_END_REG, + RCP_DPC_CURRENT_REG, + RCP_DPC_STATUS_REG, + RCP_DPC_CLOCK_REG, + RCP_DPC_BUFBUSY_REG, + RCP_DPC_PIPEBUSY_REG, + RCP_DPC_TMEM_REG, + + NUMBER_OF_CP0_REGISTERS +} CPR_specifier; +extern pu32 CR[]; extern void SP_DMA_READ(void); extern void SP_DMA_WRITE(void); diff --git a/vu/add.c b/vu/add.c index 06e2fa4d..836f414b 100644 --- a/vu/add.c +++ b/vu/add.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: MSP Simulation Layer for Vector Unit Computational Adds * * Authors: Iconoclast * -* Release: 2016.03.23 * +* Release: 2018.03.18 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -72,7 +72,7 @@ static INLINE void SIGNED_CLAMP_ADD(pi16 VD, pi16 VS, pi16 VT) { i32 sum[N]; i16 hi[N], lo[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) sum[i] = VS[i] + VT[i] + cf_co[i]; @@ -93,7 +93,7 @@ static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT) { i32 dif[N]; i16 hi[N], lo[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) dif[i] = VS[i] - VT[i] - cf_co[i]; @@ -114,7 +114,7 @@ static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT) INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT) { /* clear CARRY and carry in to accumulators */ - register int i; + register unsigned int i; for (i = 0; i < N; i++) VACC_L[i] = VS[i] + VT[i] + cf_co[i]; @@ -128,7 +128,7 @@ INLINE static void clr_ci(pi16 VD, pi16 VS, pi16 VT) INLINE 
static void clr_bi(pi16 VD, pi16 VS, pi16 VT) { /* clear CARRY and borrow in to accumulators */ - register int i; + register unsigned int i; for (i = 0; i < N; i++) VACC_L[i] = VS[i] - VT[i] - cf_co[i]; @@ -151,7 +151,7 @@ INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT) i16 neg[N], pos[N]; i16 nez[N], cch[N]; /* corner case hack -- abs(-32768) == +32767 */ ALIGNED i16 res[N]; - register int i; + register unsigned int i; vector_copy(res, VT); for (i = 0; i < N; i++) @@ -180,7 +180,7 @@ INLINE static void do_abs(pi16 VD, pi16 VS, pi16 VT) INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT) { /* set CARRY and carry out from sum */ i32 sum[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) sum[i] = (u16)(VS[i]) + (u16)(VT[i]); @@ -197,7 +197,7 @@ INLINE static void set_co(pi16 VD, pi16 VS, pi16 VT) INLINE static void set_bo(pi16 VD, pi16 VS, pi16 VT) { /* set CARRY and borrow out from difference */ i32 dif[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) dif[i] = (u16)(VS[i]) - (u16)(VT[i]); diff --git a/vu/multiply.c b/vu/multiply.c index d2ccd70f..dcf9504b 100644 --- a/vu/multiply.c +++ b/vu/multiply.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: MSP Simulation Layer for Vector Unit Computational Multiplies * * Authors: Iconoclast * -* Release: 2015.11.30 * +* Release: 2018.03.18 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -16,35 +16,24 @@ #include "multiply.h" #ifdef ARCH_MIN_SSE2 -#define _mm_cmple_epu16(dst, src) \ - _mm_cmpeq_epi16(_mm_subs_epu16(dst, src), _mm_setzero_si128()) -#define _mm_cmpgt_epu16(dst, src) \ - _mm_andnot_si128(_mm_cmpeq_epi16(dst, src), _mm_cmple_epu16(src, dst)) -#define _mm_cmplt_epu16(dst, src) \ - _mm_cmpgt_epu16(src, dst) -#define _mm_mullo_epu16(dst, src) \ - _mm_mullo_epi16(dst, src) +#define _mm_allones_si128() \ + 
_mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) +#define _mm_setmin_epi16() \ + _mm_slli_epi16(_mm_allones_si128(), 15) -static INLINE void SIGNED_CLAMP_AM(pi16 VD) -{ /* typical sign-clamp of accumulator-mid (bits 31:16) */ - v16 dst, src; - v16 pvd, pvs; +#define _mm_cmplt_epu16(dst, src) \ + _mm_cmplt_epi16( \ + _mm_xor_si128(dst, _mm_setmin_epi16()), \ + _mm_xor_si128(src, _mm_setmin_epi16()) \ + ) - pvs = _mm_load_si128((v16 *)VACC_H); - pvd = _mm_load_si128((v16 *)VACC_M); - dst = _mm_unpacklo_epi16(pvd, pvs); - src = _mm_unpackhi_epi16(pvd, pvs); - - dst = _mm_packs_epi32(dst, src); - _mm_store_si128((v16 *)VD, dst); - return; -} #else + static INLINE void SIGNED_CLAMP_AM(pi16 VD) { /* typical sign-clamp of accumulator-mid (bits 31:16) */ i16 hi[N], lo[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) lo[i] = (VACC_H[i] < ~0); @@ -61,15 +50,13 @@ static INLINE void SIGNED_CLAMP_AM(pi16 VD) VD[i] |= -(hi[i] ^ 0); for (i = 0; i < N; i++) VD[i] ^= 0x8000 * (hi[i] | lo[i]); - return; } -#endif static INLINE void UNSIGNED_CLAMP(pi16 VD) { /* sign-zero hybrid clamp of accumulator-mid (bits 31:16) */ ALIGNED i16 temp[N]; i16 cond[N]; - register int i; + register unsigned int i; SIGNED_CLAMP_AM(temp); /* no direct map in SSE, but closely based on this */ for (i = 0; i < N; i++) @@ -78,14 +65,13 @@ static INLINE void UNSIGNED_CLAMP(pi16 VD) VD[i] = temp[i] & ~(temp[i] >> 15); /* Only this clamp is unsigned. */ for (i = 0; i < N; i++) VD[i] = VD[i] | cond[i]; - return; } static INLINE void SIGNED_CLAMP_AL(pi16 VD) { /* sign-clamp accumulator-low (bits 15:0) */ ALIGNED i16 temp[N]; i16 cond[N]; - register int i; + register unsigned int i; SIGNED_CLAMP_AM(temp); /* no direct map in SSE, but closely based on this */ for (i = 0; i < N; i++) @@ -94,64 +80,8 @@ static INLINE void SIGNED_CLAMP_AL(pi16 VD) temp[i] ^= 0x8000; /* clamps 0x0000:0xFFFF instead of -0x8000:+0x7FFF */ for (i = 0; i < N; i++) VD[i] = (cond[i] ? 
temp[i] : VACC_L[i]); - return; -} - -INLINE static void do_macf(pi16 VD, pi16 VS, pi16 VT) -{ - i32 product[N]; - u32 addend[N]; - register int i; - - for (i = 0; i < N; i++) - product[i] = VS[i] * VT[i]; - for (i = 0; i < N; i++) - addend[i] = (product[i] << 1) & 0x00000000FFFF; - for (i = 0; i < N; i++) - addend[i] = (u16)(VACC_L[i]) + addend[i]; - for (i = 0; i < N; i++) - VACC_L[i] = (i16)(addend[i]); - for (i = 0; i < N; i++) - addend[i] = (addend[i] >> 16) + (u16)(product[i] >> 15); - for (i = 0; i < N; i++) - addend[i] = (u16)(VACC_M[i]) + addend[i]; - for (i = 0; i < N; i++) - VACC_M[i] = (i16)(addend[i]); - for (i = 0; i < N; i++) - VACC_H[i] -= (product[i] < 0); - for (i = 0; i < N; i++) - VACC_H[i] += addend[i] >> 16; - SIGNED_CLAMP_AM(VD); - return; -} - -INLINE static void do_macu(pi16 VD, pi16 VS, pi16 VT) -{ - i32 product[N]; - u32 addend[N]; - register int i; - - for (i = 0; i < N; i++) - product[i] = VS[i] * VT[i]; - for (i = 0; i < N; i++) - addend[i] = (product[i] << 1) & 0x00000000FFFF; - for (i = 0; i < N; i++) - addend[i] = (u16)(VACC_L[i]) + addend[i]; - for (i = 0; i < N; i++) - VACC_L[i] = (i16)(addend[i]); - for (i = 0; i < N; i++) - addend[i] = (addend[i] >> 16) + (u16)(product[i] >> 15); - for (i = 0; i < N; i++) - addend[i] = (u16)(VACC_M[i]) + addend[i]; - for (i = 0; i < N; i++) - VACC_M[i] = (i16)(addend[i]); - for (i = 0; i < N; i++) - VACC_H[i] -= (product[i] < 0); - for (i = 0; i < N; i++) - VACC_H[i] += addend[i] >> 16; - UNSIGNED_CLAMP(VD); - return; } +#endif VECTOR_OPERATION VMULF(v16 vs, v16 vt) { @@ -202,8 +132,7 @@ VECTOR_OPERATION VMULF(v16 vs, v16 vt) negative = _mm_xor_si128(negative, vs); *(v16 *)VACC_H = negative; /* 2*i16*i16 only fills L/M; VACC_H = 0 or ~0. */ - vs = _mm_add_epi16(vs, prod_hi); /* prod_hi must be -32768; + -1 = +32767 */ - return (vs); + return _mm_add_epi16(vs, prod_hi); /* prod_hi must be -32768; - 1 = +32767 */ #else word_64 product[N]; /* (-32768 * -32768)<<1 + 32768 confuses 32-bit type. 
*/ register unsigned int i; @@ -221,7 +150,6 @@ VECTOR_OPERATION VMULF(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] = -(product[i].SW < 0); /* product>>32 & 0xFFFF */ SIGNED_CLAMP_AM(V_result); - return; #endif } @@ -266,8 +194,7 @@ VECTOR_OPERATION VMULU(v16 vs, v16 vt) prod_lo = _mm_srai_epi16(prod_hi, 15); /* unsigned overflow mask */ vs = _mm_or_si128(prod_hi, prod_lo); - vs = _mm_andnot_si128(negative, vs); /* unsigned underflow mask */ - return (vs); + return _mm_andnot_si128(negative, vs); /* unsigned underflow mask */ #else word_64 product[N]; /* (-32768 * -32768)<<1 + 32768 confuses 32-bit type. */ register unsigned int i; @@ -285,7 +212,6 @@ VECTOR_OPERATION VMULU(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] = -(product[i].SW < 0); /* product>>32 & 0xFFFF */ UNSIGNED_CLAMP(V_result); - return; #endif } @@ -309,7 +235,6 @@ VECTOR_OPERATION VMUDL(v16 vs, v16 vt) vector_copy(V_result, VACC_L); vector_wipe(VACC_M); vector_wipe(VACC_H); - return; #endif } @@ -348,7 +273,6 @@ VECTOR_OPERATION VMUDM(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] = -(VACC_M[i] < 0); vector_copy(V_result, VACC_M); - return; #endif } @@ -372,7 +296,7 @@ VECTOR_OPERATION VMUDN(v16 vs, v16 vt) *(v16 *)VACC_M = prod_hi; prod_hi = _mm_srai_epi16(prod_hi, 15); *(v16 *)VACC_H = prod_hi; - return (vs = prod_lo); + return (prod_lo); #else word_32 product[N]; register unsigned int i; @@ -386,7 +310,6 @@ VECTOR_OPERATION VMUDN(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] = -(VACC_M[i] < 0); vector_copy(V_result, VACC_L); - return; #endif } @@ -413,8 +336,7 @@ VECTOR_OPERATION VMUDH(v16 vs, v16 vt) * Re-interleave or pack both 32-bit products in both xmm registers with * signed saturation: prod < -32768 to -32768 and prod > +32767 to +32767. 
*/ - vs = _mm_packs_epi32(vs, vt); - return (vs); + return _mm_packs_epi32(vs, vt); #else word_32 product[N]; register unsigned int i; @@ -427,57 +349,143 @@ VECTOR_OPERATION VMUDH(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] = (s16)(product[i].W >> 16); /* product[i].HW[HES(2) >> 1] */ SIGNED_CLAMP_AM(V_result); - return; #endif } VECTOR_OPERATION VMACF(v16 vs, v16 vt) { - ALIGNED i16 VD[N]; #ifdef ARCH_MIN_SSE2 - ALIGNED i16 VS[N], VT[N]; + v16 acc_hi, acc_md, acc_lo; + v16 prod_hi, prod_lo; + v16 overflow, overflow_new; + v16 prod_neg, carry; - *(v16 *)VS = vs; - *(v16 *)VT = vt; -#else - v16 VS, VT; + prod_hi = _mm_mulhi_epi16(vs, vt); + prod_lo = _mm_mullo_epi16(vs, vt); + prod_neg = _mm_srli_epi16(prod_hi, 15); - VS = vs; - VT = vt; -#endif - do_macf(VD, VS, VT); -#ifdef ARCH_MIN_SSE2 - COMPILER_FENCE(); - vs = *(v16 *)VD; - return (vs); + /* fractional adjustment by shifting left one bit */ + overflow = _mm_srli_epi16(prod_lo, 15); /* hi bit lost when s16 += s16 */ + prod_lo = _mm_add_epi16(prod_lo, prod_lo); + prod_hi = _mm_add_epi16(prod_hi, prod_hi); + prod_hi = _mm_or_si128(prod_hi, overflow); /* Carry lo's MSB to hi's LSB. */ + + acc_lo = *(v16 *)VACC_L; + acc_md = *(v16 *)VACC_M; + acc_hi = *(v16 *)VACC_H; + + acc_lo = _mm_add_epi16(acc_lo, prod_lo); + *(v16 *)VACC_L = acc_lo; + overflow = _mm_cmplt_epu16(acc_lo, prod_lo); /* a + b < a + 0 ? 
~0 : 0 */ + + acc_md = _mm_add_epi16(acc_md, prod_hi); + overflow_new = _mm_cmplt_epu16(acc_md, prod_hi); + acc_md = _mm_sub_epi16(acc_md, overflow); /* m - (overflow = ~0) == m + 1 */ + carry = _mm_cmpeq_epi16(acc_md, _mm_setzero_si128()); + carry = _mm_and_si128(carry, overflow); /* ~0 - (-1) == 0 && (-1) != 0 */ + *(v16 *)VACC_M = acc_md; + overflow = _mm_or_si128(carry, overflow_new); + + acc_hi = _mm_sub_epi16(acc_hi, overflow); + acc_hi = _mm_sub_epi16(acc_hi, prod_neg); + *(v16 *)VACC_H = acc_hi; + + vt = _mm_unpackhi_epi16(acc_md, acc_hi); + vs = _mm_unpacklo_epi16(acc_md, acc_hi); + return _mm_packs_epi32(vs, vt); #else - vector_copy(V_result, VD); - return; + word_32 product[N], addend[N]; + register unsigned int i; + + for (i = 0; i < N; i++) + product[i].SW = vs[i] * vt[i]; + for (i = 0; i < N; i++) + addend[i].UW = (product[i].SW << 1) & 0x00000000FFFF; + for (i = 0; i < N; i++) + addend[i].UW = (u16)(VACC_L[i]) + addend[i].UW; + for (i = 0; i < N; i++) + VACC_L[i] = (i16)(addend[i].UW); + for (i = 0; i < N; i++) + addend[i].UW = (addend[i].UW >> 16) + (u16)(product[i].SW >> 15); + for (i = 0; i < N; i++) + addend[i].UW = (u16)(VACC_M[i]) + addend[i].UW; + for (i = 0; i < N; i++) + VACC_M[i] = (i16)(addend[i].UW); + for (i = 0; i < N; i++) + VACC_H[i] -= (product[i].SW < 0); + for (i = 0; i < N; i++) + VACC_H[i] += addend[i].UW >> 16; + SIGNED_CLAMP_AM(V_result); #endif } VECTOR_OPERATION VMACU(v16 vs, v16 vt) { - ALIGNED i16 VD[N]; #ifdef ARCH_MIN_SSE2 - ALIGNED i16 VS[N], VT[N]; + v16 acc_hi, acc_md, acc_lo; + v16 prod_hi, prod_lo; + v16 overflow, overflow_new; + v16 prod_neg, carry; - *(v16 *)VS = vs; - *(v16 *)VT = vt; -#else - v16 VS, VT; + prod_hi = _mm_mulhi_epi16(vs, vt); + prod_lo = _mm_mullo_epi16(vs, vt); + prod_neg = _mm_srli_epi16(prod_hi, 15); - VS = vs; - VT = vt; -#endif - do_macu(VD, VS, VT); -#ifdef ARCH_MIN_SSE2 - COMPILER_FENCE(); - vs = *(v16 *)VD; - return (vs); + /* fractional adjustment by shifting left one bit */ + overflow = 
_mm_srli_epi16(prod_lo, 15); /* hi bit lost when s16 += s16 */ + prod_lo = _mm_add_epi16(prod_lo, prod_lo); + prod_hi = _mm_add_epi16(prod_hi, prod_hi); + prod_hi = _mm_or_si128(prod_hi, overflow); /* Carry lo's MSB to hi's LSB. */ + + acc_lo = *(v16 *)VACC_L; + acc_md = *(v16 *)VACC_M; + acc_hi = *(v16 *)VACC_H; + + acc_lo = _mm_add_epi16(acc_lo, prod_lo); + *(v16 *)VACC_L = acc_lo; + overflow = _mm_cmplt_epu16(acc_lo, prod_lo); /* a + b < a + 0 ? ~0 : 0 */ + + acc_md = _mm_add_epi16(acc_md, prod_hi); + overflow_new = _mm_cmplt_epu16(acc_md, prod_hi); + acc_md = _mm_sub_epi16(acc_md, overflow); /* m - (overflow = ~0) == m + 1 */ + carry = _mm_cmpeq_epi16(acc_md, _mm_setzero_si128()); + carry = _mm_and_si128(carry, overflow); /* ~0 - (-1) == 0 && (-1) != 0 */ + *(v16 *)VACC_M = acc_md; + overflow = _mm_or_si128(carry, overflow_new); + + acc_hi = _mm_sub_epi16(acc_hi, overflow); + acc_hi = _mm_sub_epi16(acc_hi, prod_neg); + *(v16 *)VACC_H = acc_hi; + + vt = _mm_unpackhi_epi16(acc_md, acc_hi); + vs = _mm_unpacklo_epi16(acc_md, acc_hi); + vs = _mm_packs_epi32(vs, vt); + overflow = _mm_cmplt_epi16(acc_md, vs); + vs = _mm_andnot_si128(_mm_srai_epi16(vs, 15), vs); + return _mm_or_si128(vs, overflow); #else - vector_copy(V_result, VD); - return; + word_32 product[N], addend[N]; + register unsigned int i; + + for (i = 0; i < N; i++) + product[i].SW = vs[i] * vt[i]; + for (i = 0; i < N; i++) + addend[i].UW = (product[i].SW << 1) & 0x00000000FFFF; + for (i = 0; i < N; i++) + addend[i].UW = (u16)(VACC_L[i]) + addend[i].UW; + for (i = 0; i < N; i++) + VACC_L[i] = (i16)(addend[i].UW); + for (i = 0; i < N; i++) + addend[i].UW = (addend[i].UW >> 16) + (u16)(product[i].SW >> 15); + for (i = 0; i < N; i++) + addend[i].UW = (u16)(VACC_M[i]) + addend[i].UW; + for (i = 0; i < N; i++) + VACC_M[i] = (i16)(addend[i].UW); + for (i = 0; i < N; i++) + VACC_H[i] -= (product[i].SW < 0); + for (i = 0; i < N; i++) + VACC_H[i] += addend[i].UW >> 16; + UNSIGNED_CLAMP(V_result); #endif } @@ -488,7 
+496,7 @@ VECTOR_OPERATION VMADL(v16 vs, v16 vt) v16 prod_hi; v16 overflow, overflow_new; - /* prod_lo = _mm_mullo_epu16(vs, vt); */ + /* prod_lo = _mm_mullo_epi16(vs, vt); */ prod_hi = _mm_mulhi_epu16(vs, vt); acc_lo = *(v16 *)VACC_L; @@ -531,8 +539,7 @@ VECTOR_OPERATION VMADL(v16 vs, v16 vt) vs = _mm_and_si128(vs, acc_md); /* ... ? VS_clamped : 0x0000 */ vs = _mm_or_si128(vs, acc_lo); /* : acc_lo */ acc_md = _mm_slli_epi16(acc_md, 15); /* ... ? ^ 0x8000 : ^ 0x0000 */ - vs = _mm_xor_si128(vs, acc_md); /* Stupid unsigned-clamp-ish adjustment. */ - return (vs); + return _mm_xor_si128(vs, acc_md); /* stupid unsigned-clamp-ish adjustment */ #else word_32 product[N], addend[N]; register unsigned int i; @@ -552,7 +559,6 @@ VECTOR_OPERATION VMADL(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] += addend[i].UW >> 16; SIGNED_CLAMP_AL(V_result); - return; #endif } @@ -594,8 +600,7 @@ VECTOR_OPERATION VMADM(v16 vs, v16 vt) vt = _mm_unpackhi_epi16(acc_md, acc_hi); vs = _mm_unpacklo_epi16(acc_md, acc_hi); - vs = _mm_packs_epi32(vs, vt); - return (vs); + return _mm_packs_epi32(vs, vt); #else word_32 product[N], addend[N]; register unsigned int i; @@ -615,7 +620,6 @@ VECTOR_OPERATION VMADM(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] += addend[i].UW >> 16; SIGNED_CLAMP_AM(V_result); - return; #endif } @@ -675,8 +679,7 @@ VECTOR_OPERATION VMADN(v16 vs, v16 vt) vs = _mm_and_si128(vs, acc_md); /* ... ? VS_clamped : 0x0000 */ vs = _mm_or_si128(vs, acc_lo); /* : acc_lo */ acc_md = _mm_slli_epi16(acc_md, 15); /* ... ? ^ 0x8000 : ^ 0x0000 */ - vs = _mm_xor_si128(vs, acc_md); /* Stupid unsigned-clamp-ish adjustment. 
*/ - return (vs); + return _mm_xor_si128(vs, acc_md); /* stupid unsigned-clamp-ish adjustment */ #else word_32 product[N], addend[N]; register unsigned int i; @@ -696,7 +699,6 @@ VECTOR_OPERATION VMADN(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] += addend[i].UW >> 16; SIGNED_CLAMP_AL(V_result); - return; #endif } @@ -734,8 +736,7 @@ VECTOR_OPERATION VMADH(v16 vs, v16 vt) vs = *(v16 *)VACC_M; prod_high = _mm_unpackhi_epi16(vs, vt); vs = _mm_unpacklo_epi16(vs, vt); - vs = _mm_packs_epi32(vs, prod_high); - return (vs); + return _mm_packs_epi32(vs, prod_high); #else word_32 product[N], addend[N]; register unsigned int i; @@ -749,6 +750,5 @@ VECTOR_OPERATION VMADH(v16 vs, v16 vt) for (i = 0; i < N; i++) VACC_H[i] += (addend[i].UW >> 16) + (product[i].SW >> 16); SIGNED_CLAMP_AM(V_result); - return; #endif } diff --git a/vu/select.c b/vu/select.c index a1c50925..3d290cee 100644 --- a/vu/select.c +++ b/vu/select.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: MSP Simulation Layer for Vector Unit Computational Test Selects * * Authors: Iconoclast * -* Release: 2015.01.30 * +* Release: 2018.03.18 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -29,7 +29,7 @@ */ static void merge(pi16 VD, pi16 cmp, pi16 pass, pi16 fail) { - register int i; + register unsigned int i; #if (0 != 0) /* Do not use this version yet, as it still does not vectorize to SSE2. 
*/ for (i = 0; i < N; i++) @@ -49,7 +49,7 @@ INLINE static void do_lt(pi16 VD, pi16 VS, pi16 VT) { i16 cn[N]; i16 eq[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) eq[i] = (VS[i] == VT[i]); @@ -75,7 +75,7 @@ INLINE static void do_lt(pi16 VD, pi16 VS, pi16 VT) INLINE static void do_eq(pi16 VD, pi16 VS, pi16 VT) { - register int i; + register unsigned int i; for (i = 0; i < N; i++) cf_comp[i] = (VS[i] == VT[i]); @@ -98,7 +98,7 @@ INLINE static void do_eq(pi16 VD, pi16 VS, pi16 VT) INLINE static void do_ne(pi16 VD, pi16 VS, pi16 VT) { - register int i; + register unsigned int i; for (i = 0; i < N; i++) cf_comp[i] = (VS[i] != VT[i]); @@ -123,7 +123,7 @@ INLINE static void do_ge(pi16 VD, pi16 VS, pi16 VT) { i16 ce[N]; i16 eq[N]; - register int i; + register unsigned int i; for (i = 0; i < N; i++) eq[i] = (VS[i] == VT[i]); @@ -154,7 +154,7 @@ INLINE static void do_cl(pi16 VD, pi16 VS, pi16 VT) ALIGNED i16 gen[N], len[N], lz[N], uz[N], sn[N]; i16 diff[N]; i16 cmp[N]; - register int i; + register unsigned int i; vector_copy((pi16)VB, VS); vector_copy((pi16)VC, VT); @@ -230,7 +230,7 @@ INLINE static void do_ch(pi16 VD, pi16 VS, pi16 VT) i16 diff[N]; #endif i16 cch[N]; /* corner case hack: -(-32768) with undefined sign */ - register int i; + register unsigned int i; for (i = 0; i < N; i++) cch[i] = (VT[i] == -32768) ? ~0 : 0; /* -(-32768) might not be >= 0. 
*/ @@ -297,7 +297,7 @@ INLINE static void do_cr(pi16 VD, pi16 VS, pi16 VT) ALIGNED i16 ge[N], le[N], sn[N]; ALIGNED i16 VC[N]; i16 cmp[N]; - register int i; + register unsigned int i; vector_copy(VC, VT); for (i = 0; i < N; i++) diff --git a/vu/vu.c b/vu/vu.c index 1763a626..0af953a5 100644 --- a/vu/vu.c +++ b/vu/vu.c @@ -1,7 +1,7 @@ /******************************************************************************\ * Project: MSP Emulation Layer for Vector Unit Computational Operations * * Authors: Iconoclast * -* Release: 2016.03.23 * +* Release: 2018.03.18 * * License: CC0 Public Domain Dedication * * * * To the extent possible under law, the author(s) have dedicated all copyright * @@ -133,19 +133,20 @@ u16 get_VCC(void) } u8 get_VCE(void) { - int result; + unsigned int result; register u8 vce; result = 0x00 - | (cf_vce[07] << 0x7) - | (cf_vce[06] << 0x6) - | (cf_vce[05] << 0x5) - | (cf_vce[04] << 0x4) - | (cf_vce[03] << 0x3) - | (cf_vce[02] << 0x2) - | (cf_vce[01] << 0x1) - | (cf_vce[00] << 0x0); - vce = result & 0xFF; + | (cf_vce[0x7] << 0x7) + | (cf_vce[0x6] << 0x6) + | (cf_vce[0x5] << 0x5) + | (cf_vce[0x4] << 0x4) + | (cf_vce[0x3] << 0x3) + | (cf_vce[0x2] << 0x2) + | (cf_vce[0x1] << 0x1) + | (cf_vce[0x0] << 0x0) + ; + vce = (u8)(result & 0xFF); return (vce); /* Big endian becomes little. */ } #else @@ -207,7 +208,7 @@ u8 get_VCE(void) */ void set_VCO(u16 vco) { - register int i; + register unsigned int i; for (i = 0; i < N; i++) cf_co[i] = (vco >> (i + 0x0)) & 1; @@ -217,7 +218,7 @@ void set_VCO(u16 vco) } void set_VCC(u16 vcc) { - register int i; + register unsigned int i; for (i = 0; i < N; i++) cf_comp[i] = (vcc >> (i + 0x0)) & 1; @@ -227,7 +228,7 @@ void set_VCC(u16 vcc) } void set_VCE(u8 vce) { - register int i; + register unsigned int i; for (i = 0; i < N; i++) cf_vce[i] = (vce >> i) & 1;