diff --git a/.gitignore b/.gitignore index ce627b1f6..3c9ce5970 100644 --- a/.gitignore +++ b/.gitignore @@ -29,3 +29,12 @@ mingw32-config.cache */.dirstamp *.iml +aesb-arm.S.orig +aesb-x64.S.orig +aesb-x86.S.orig +scrypt-arm.S.orig +scrypt-x64.S.orig +scrypt-x86.S.orig +sha2-arm.S.orig +sha2-x64.S.orig +sha2-x86.S.orig diff --git a/Makefile.am b/Makefile.am index a44d14193..13d1e0ca5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -5,11 +5,16 @@ else JANSSON_INCLUDES= endif +OPENSSL_INCLUDE = -I/usr/local/opt/openssl/include +OPENSSL_LIB = -L/usr/local/opt/openssl/lib + EXTRA_DIST = example-cfg.json nomacro.pl +ACLOCAL_AMFLAGS = -I m4 + SUBDIRS = compat -INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) +INCLUDES = $(PTHREAD_FLAGS) -fno-strict-aliasing $(JANSSON_INCLUDES) $(OPENSSL_INCLUDE) bin_PROGRAMS = minerd @@ -70,7 +75,7 @@ minerd_SOURCES += sha2-arm.S scrypt-arm.S aesb-arm.S crypto/aesb-x86-impl.c endif endif -minerd_LDFLAGS = $(PTHREAD_FLAGS) +minerd_LDFLAGS = $(PTHREAD_FLAGS) $(OPENSSL_LIB) minerd_LDADD = @LIBCURL@ @JANSSON_LIBS@ @PTHREAD_LIBS@ @WS2_LIBS@ minerd_CPPFLAGS = @LIBCURL_CPPFLAGS@ minerd_CFLAGS = -Ofast -flto -fuse-linker-plugin diff --git a/autogen.sh b/autogen.sh index 989604a9b..e33319418 100755 --- a/autogen.sh +++ b/autogen.sh @@ -5,7 +5,7 @@ set -e -aclocal +aclocal -I m4 autoheader automake --gnu --add-missing --copy autoconf diff --git a/fresh.c b/fresh.c index 8193cabc1..90c2ee5b7 100644 --- a/fresh.c +++ b/fresh.c @@ -11,7 +11,7 @@ //#define DEBUG_ALGO -inline void freshhash(void* output, const void* input, uint32_t len) +static inline void freshhash(void* output, const void* input, uint32_t len) { unsigned char hash[128]; // uint32_t hashA[16], hashB[16]; #define hashA hash diff --git a/m4/libcurl.m4 b/m4/libcurl.m4 new file mode 100644 index 000000000..53d694d0a --- /dev/null +++ b/m4/libcurl.m4 @@ -0,0 +1,272 @@ +#*************************************************************************** +# _ _ ____ _ +# Project ___| | | | _ \| | +# / __| | | | |_) | | +# | (__| |_| | _ <| |___ +# \___|\___/|_| \_\_____| +# +# Copyright (C) 2006, David Shaw +# +# This software is licensed as described in the file COPYING, which +# you should have received as part of this distribution. The terms +# are also available at https://curl.haxx.se/docs/copyright.html. +# +# You may opt to use, copy, modify, merge, publish, distribute and/or sell +# copies of the Software, and permit persons to whom the Software is +# furnished to do so, under the terms of the COPYING file. +# +# This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY +# KIND, either express or implied. +# +########################################################################### +# LIBCURL_CHECK_CONFIG ([DEFAULT-ACTION], [MINIMUM-VERSION], +# [ACTION-IF-YES], [ACTION-IF-NO]) +# ---------------------------------------------------------- +# David Shaw May-09-2006 +# +# Checks for libcurl. DEFAULT-ACTION is the string yes or no to +# specify whether to default to --with-libcurl or --without-libcurl. +# If not supplied, DEFAULT-ACTION is yes. MINIMUM-VERSION is the +# minimum version of libcurl to accept. Pass the version as a regular +# version number like 7.10.1. If not supplied, any version is +# accepted. ACTION-IF-YES is a list of shell commands to run if +# libcurl was successfully found and passed the various tests. +# ACTION-IF-NO is a list of shell commands that are run otherwise. +# Note that using --without-libcurl does run ACTION-IF-NO. 
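+#
+# Illustrative example (not part of the upstream macro text; the version and
+# message below are placeholders): a configure.ac that requires libcurl
+# 7.15.2 or newer might invoke the macro roughly as
+#   LIBCURL_CHECK_CONFIG([yes], [7.15.2], [],
+#                        [AC_MSG_ERROR([libcurl >= 7.15.2 is required])])
+# leaving ACTION-IF-YES empty and aborting configure when libcurl is missing.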
+# +# This macro #defines HAVE_LIBCURL if a working libcurl setup is +# found, and sets @LIBCURL@ and @LIBCURL_CPPFLAGS@ to the necessary +# values. Other useful defines are LIBCURL_FEATURE_xxx where xxx are +# the various features supported by libcurl, and LIBCURL_PROTOCOL_yyy +# where yyy are the various protocols supported by libcurl. Both xxx +# and yyy are capitalized. See the list of AH_TEMPLATEs at the top of +# the macro for the complete list of possible defines. Shell +# variables $libcurl_feature_xxx and $libcurl_protocol_yyy are also +# defined to 'yes' for those features and protocols that were found. +# Note that xxx and yyy keep the same capitalization as in the +# curl-config list (e.g. it's "HTTP" and not "http"). +# +# Users may override the detected values by doing something like: +# LIBCURL="-lcurl" LIBCURL_CPPFLAGS="-I/usr/myinclude" ./configure +# +# For the sake of sanity, this macro assumes that any libcurl that is +# found is after version 7.7.2, the first version that included the +# curl-config script. Note that it is very important for people +# packaging binary versions of libcurl to include this script! +# Without curl-config, we can only guess what protocols are available, +# or use curl_version_info to figure it out at runtime. + +AC_DEFUN([LIBCURL_CHECK_CONFIG], +[ + AH_TEMPLATE([LIBCURL_FEATURE_SSL],[Defined if libcurl supports SSL]) + AH_TEMPLATE([LIBCURL_FEATURE_KRB4],[Defined if libcurl supports KRB4]) + AH_TEMPLATE([LIBCURL_FEATURE_IPV6],[Defined if libcurl supports IPv6]) + AH_TEMPLATE([LIBCURL_FEATURE_LIBZ],[Defined if libcurl supports libz]) + AH_TEMPLATE([LIBCURL_FEATURE_ASYNCHDNS],[Defined if libcurl supports AsynchDNS]) + AH_TEMPLATE([LIBCURL_FEATURE_IDN],[Defined if libcurl supports IDN]) + AH_TEMPLATE([LIBCURL_FEATURE_SSPI],[Defined if libcurl supports SSPI]) + AH_TEMPLATE([LIBCURL_FEATURE_NTLM],[Defined if libcurl supports NTLM]) + + AH_TEMPLATE([LIBCURL_PROTOCOL_HTTP],[Defined if libcurl supports HTTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_HTTPS],[Defined if libcurl supports HTTPS]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FTP],[Defined if libcurl supports FTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FTPS],[Defined if libcurl supports FTPS]) + AH_TEMPLATE([LIBCURL_PROTOCOL_FILE],[Defined if libcurl supports FILE]) + AH_TEMPLATE([LIBCURL_PROTOCOL_TELNET],[Defined if libcurl supports TELNET]) + AH_TEMPLATE([LIBCURL_PROTOCOL_LDAP],[Defined if libcurl supports LDAP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_DICT],[Defined if libcurl supports DICT]) + AH_TEMPLATE([LIBCURL_PROTOCOL_TFTP],[Defined if libcurl supports TFTP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_RTSP],[Defined if libcurl supports RTSP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_POP3],[Defined if libcurl supports POP3]) + AH_TEMPLATE([LIBCURL_PROTOCOL_IMAP],[Defined if libcurl supports IMAP]) + AH_TEMPLATE([LIBCURL_PROTOCOL_SMTP],[Defined if libcurl supports SMTP]) + + AC_ARG_WITH(libcurl, + AS_HELP_STRING([--with-libcurl=PREFIX],[look for the curl library in PREFIX/lib and headers in PREFIX/include]), + [_libcurl_with=$withval],[_libcurl_with=ifelse([$1],,[yes],[$1])]) + + if test "$_libcurl_with" != "no" ; then + + AC_PROG_AWK + + _libcurl_version_parse="eval $AWK '{split(\$NF,A,\".\"); X=256*256*A[[1]]+256*A[[2]]+A[[3]]; print X;}'" + + _libcurl_try_link=yes + + if test -d "$_libcurl_with" ; then + LIBCURL_CPPFLAGS="-I$withval/include" + _libcurl_ldflags="-L$withval/lib" + AC_PATH_PROG([_libcurl_config],[curl-config],[], + ["$withval/bin"]) + else + AC_PATH_PROG([_libcurl_config],[curl-config],[],[$PATH]) + fi + + if test 
x$_libcurl_config != "x" ; then + AC_CACHE_CHECK([for the version of libcurl], + [libcurl_cv_lib_curl_version], + [libcurl_cv_lib_curl_version=`$_libcurl_config --version | $AWK '{print $[]2}'`]) + + _libcurl_version=`echo $libcurl_cv_lib_curl_version | $_libcurl_version_parse` + _libcurl_wanted=`echo ifelse([$2],,[0],[$2]) | $_libcurl_version_parse` + + if test $_libcurl_wanted -gt 0 ; then + AC_CACHE_CHECK([for libcurl >= version $2], + [libcurl_cv_lib_version_ok], + [ + if test $_libcurl_version -ge $_libcurl_wanted ; then + libcurl_cv_lib_version_ok=yes + else + libcurl_cv_lib_version_ok=no + fi + ]) + fi + + if test $_libcurl_wanted -eq 0 || test x$libcurl_cv_lib_version_ok = xyes ; then + if test x"$LIBCURL_CPPFLAGS" = "x" ; then + LIBCURL_CPPFLAGS=`$_libcurl_config --cflags` + fi + if test x"$LIBCURL" = "x" ; then + LIBCURL=`$_libcurl_config --libs` + + # This is so silly, but Apple actually has a bug in their + # curl-config script. Fixed in Tiger, but there are still + # lots of Panther installs around. + case "${host}" in + powerpc-apple-darwin7*) + LIBCURL=`echo $LIBCURL | sed -e 's|-arch i386||g'` + ;; + esac + fi + + # All curl-config scripts support --feature + _libcurl_features=`$_libcurl_config --feature` + + # Is it modern enough to have --protocols? (7.12.4) + if test $_libcurl_version -ge 461828 ; then + _libcurl_protocols=`$_libcurl_config --protocols` + fi + else + _libcurl_try_link=no + fi + + unset _libcurl_wanted + fi + + if test $_libcurl_try_link = yes ; then + + # we didn't find curl-config, so let's see if the user-supplied + # link line (or failing that, "-lcurl") is enough. + LIBCURL=${LIBCURL-"$_libcurl_ldflags -lcurl"} + + AC_CACHE_CHECK([whether libcurl is usable], + [libcurl_cv_lib_curl_usable], + [ + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$LIBCURL_CPPFLAGS $CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBCURL $LIBS" + + AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include <curl/curl.h>]],[[ +/* Try and use a few common options to force a failure if we are + missing symbols or can't link. */ +int x; +curl_easy_setopt(NULL,CURLOPT_URL,NULL); +x=CURL_ERROR_SIZE; +x=CURLOPT_WRITEFUNCTION; +x=CURLOPT_WRITEDATA; +x=CURLOPT_ERRORBUFFER; +x=CURLOPT_STDERR; +x=CURLOPT_VERBOSE; +if (x) {;} +]])],libcurl_cv_lib_curl_usable=yes,libcurl_cv_lib_curl_usable=no) + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + ]) + + if test $libcurl_cv_lib_curl_usable = yes ; then + + # Does curl_free() exist in this version of libcurl?
+ # If not, fake it with free() + + _libcurl_save_cppflags=$CPPFLAGS + CPPFLAGS="$CPPFLAGS $LIBCURL_CPPFLAGS" + _libcurl_save_libs=$LIBS + LIBS="$LIBS $LIBCURL" + + AC_CHECK_FUNC(curl_free,, + AC_DEFINE(curl_free,free, + [Define curl_free() as free() if our version of curl lacks curl_free.])) + + CPPFLAGS=$_libcurl_save_cppflags + LIBS=$_libcurl_save_libs + unset _libcurl_save_cppflags + unset _libcurl_save_libs + + AC_DEFINE(HAVE_LIBCURL,1, + [Define to 1 if you have a functional curl library.]) + AC_SUBST(LIBCURL_CPPFLAGS) + AC_SUBST(LIBCURL) + + for _libcurl_feature in $_libcurl_features ; do + AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_feature_$_libcurl_feature),[1]) + eval AS_TR_SH(libcurl_feature_$_libcurl_feature)=yes + done + + if test "x$_libcurl_protocols" = "x" ; then + + # We don't have --protocols, so just assume that all + # protocols are available + _libcurl_protocols="HTTP FTP FILE TELNET LDAP DICT TFTP" + + if test x$libcurl_feature_SSL = xyes ; then + _libcurl_protocols="$_libcurl_protocols HTTPS" + + # FTPS wasn't standards-compliant until version + # 7.11.0 (0x070b00 == 461568) + if test $_libcurl_version -ge 461568; then + _libcurl_protocols="$_libcurl_protocols FTPS" + fi + fi + + # RTSP, IMAP, POP3 and SMTP were added in + # 7.20.0 (0x071400 == 463872) + if test $_libcurl_version -ge 463872; then + _libcurl_protocols="$_libcurl_protocols RTSP IMAP POP3 SMTP" + fi + fi + + for _libcurl_protocol in $_libcurl_protocols ; do + AC_DEFINE_UNQUOTED(AS_TR_CPP(libcurl_protocol_$_libcurl_protocol),[1]) + eval AS_TR_SH(libcurl_protocol_$_libcurl_protocol)=yes + done + else + unset LIBCURL + unset LIBCURL_CPPFLAGS + fi + fi + + unset _libcurl_try_link + unset _libcurl_version_parse + unset _libcurl_config + unset _libcurl_feature + unset _libcurl_features + unset _libcurl_protocol + unset _libcurl_protocols + unset _libcurl_version + unset _libcurl_ldflags + fi + + if test x$_libcurl_with = xno || test x$libcurl_cv_lib_curl_usable != xyes ; then + # This is the IF-NO path + ifelse([$4],,:,[$4]) + else + # This is the IF-YES path + ifelse([$3],,:,[$3]) + fi + + unset _libcurl_with +])dnl diff --git a/nomacro.pl b/nomacro.pl old mode 100644 new mode 100755 diff --git a/scrypt-arm.S b/scrypt-arm.S index 5be3b0e9d..dd533d52f 100644 --- a/scrypt-arm.S +++ b/scrypt-arm.S @@ -27,437 +27,406 @@ #ifdef __ARM_ARCH_5E_OR_6__ -.macro scrypt_shuffle - add lr, r0, #9*4 - ldmia r0, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #5*4] - str r5, [r0, #15*4] - str r6, [r0, #12*4] - str r7, [r0, #1*4] - ldr r5, [r0, #7*4] - str r2, [r0, #13*4] - str r8, [r0, #2*4] - strd r4, [r0, #10*4] - str r9, [r0, #7*4] - str r10, [r0, #4*4] - str r11, [r0, #9*4] - str lr, [r0, #3*4] - - add r2, r0, #64+0*4 - add lr, r0, #64+9*4 - ldmia r2, {r2-r7} - ldmia lr, {r2, r8-r12, lr} - str r3, [r0, #64+5*4] - str r5, [r0, #64+15*4] - str r6, [r0, #64+12*4] - str r7, [r0, #64+1*4] - ldr r5, [r0, #64+7*4] - str r2, [r0, #64+13*4] - str r8, [r0, #64+2*4] - strd r4, [r0, #64+10*4] - str r9, [r0, #64+7*4] - str r10, [r0, #64+4*4] - str r11, [r0, #64+9*4] - str lr, [r0, #64+3*4] -.endm +#define scrypt_shuffle() \ + add lr, r0, #9*4; \ + ldmia r0, {r2-r7}; \ + ldmia lr, {r2, r8-r12, lr}; \ + str r3, [r0, #5*4]; \ + str r5, [r0, #15*4]; \ + str r6, [r0, #12*4]; \ + str r7, [r0, #1*4]; \ + ldr r5, [r0, #7*4]; \ + str r2, [r0, #13*4]; \ + str r8, [r0, #2*4]; \ + strd r4, [r0, #10*4]; \ + str r9, [r0, #7*4]; \ + str r10, [r0, #4*4]; \ + str r11, [r0, #9*4]; \ + str lr, [r0, #3*4]; \ + add r2, r0, #64+0*4; \ + add lr, r0, 
#64+9*4; \ + ldmia r2, {r2-r7}; \ + ldmia lr, {r2, r8-r12, lr}; \ + str r3, [r0, #64+5*4]; \ + str r5, [r0, #64+15*4]; \ + str r6, [r0, #64+12*4]; \ + str r7, [r0, #64+1*4]; \ + ldr r5, [r0, #64+7*4]; \ + str r2, [r0, #64+13*4]; \ + str r8, [r0, #64+2*4]; \ + strd r4, [r0, #64+10*4]; \ + str r9, [r0, #64+7*4]; \ + str r10, [r0, #64+4*4]; \ + str r11, [r0, #64+9*4]; \ + str lr, [r0, #64+3*4]; \ -.macro salsa8_core_doubleround_body - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #25 - add r6, r0, r4 - eor r11, r11, r7, ror #25 - add r7, r1, r5 - strd r10, [sp, #14*4] - eor r12, r12, r6, ror #25 - eor lr, lr, r7, ror #25 - - ldrd r6, [sp, #10*4] - add r2, r10, r2 - add r3, r11, r3 - eor r6, r6, r2, ror #23 - add r2, r12, r0 - eor r7, r7, r3, ror #23 - add r3, lr, r1 - strd r6, [sp, #10*4] - eor r8, r8, r2, ror #23 - eor r9, r9, r3, ror #23 - - ldrd r2, [sp, #6*4] - add r10, r6, r10 - add r11, r7, r11 - eor r2, r2, r10, ror #19 - add r10, r8, r12 - eor r3, r3, r11, ror #19 - add r11, r9, lr - eor r4, r4, r10, ror #19 - eor r5, r5, r11, ror #19 - - ldrd r10, [sp, #2*4] - add r6, r2, r6 - add r7, r3, r7 - eor r10, r10, r6, ror #14 - add r6, r4, r8 - eor r11, r11, r7, ror #14 - add r7, r5, r9 - eor r0, r0, r6, ror #14 - eor r1, r1, r7, ror #14 - - - ldrd r6, [sp, #14*4] - strd r2, [sp, #6*4] - strd r10, [sp, #2*4] - add r6, r11, r6 - add r7, r0, r7 - eor r4, r4, r6, ror #25 - add r6, r1, r12 - eor r5, r5, r7, ror #25 - add r7, r10, lr - eor r2, r2, r6, ror #25 - eor r3, r3, r7, ror #25 - strd r2, [sp, #6*4] - - add r10, r3, r10 - ldrd r6, [sp, #10*4] - add r11, r4, r11 - eor r8, r8, r10, ror #23 - add r10, r5, r0 - eor r9, r9, r11, ror #23 - add r11, r2, r1 - eor r6, r6, r10, ror #23 - eor r7, r7, r11, ror #23 - strd r6, [sp, #10*4] - - add r2, r7, r2 - ldrd r10, [sp, #14*4] - add r3, r8, r3 - eor r12, r12, r2, ror #19 - add r2, r9, r4 - eor lr, lr, r3, ror #19 - add r3, r6, r5 - eor r10, r10, r2, ror #19 - eor r11, r11, r3, ror #19 - - ldrd r2, [sp, #2*4] - add r6, r11, r6 - add r7, r12, r7 - eor r0, r0, r6, ror #14 - add r6, lr, r8 - eor r1, r1, r7, ror #14 - add r7, r10, r9 - eor r2, r2, r6, ror #14 - eor r3, r3, r7, ror #14 -.endm -.macro salsa8_core - ldmia sp, {r0-r12, lr} - - ldrd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - ldrd r6, [sp, #6*4] - strd r2, [sp, #2*4] - strd r10, [sp, #14*4] - salsa8_core_doubleround_body - - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] - strd r10, [sp, #14*4] -.endm +#define salsa8_core_doubleround_body() \ + add r6, r2, r6; \ + add r7, r3, r7; \ + eor r10, r10, r6, ror #25; \ + add r6, r0, r4; \ + eor r11, r11, r7, ror #25; \ + add r7, r1, r5; \ + strd r10, [sp, #14*4]; \ + eor r12, r12, r6, ror #25; \ + eor lr, lr, r7, ror #25; \ + ldrd r6, [sp, #10*4]; \ + add r2, r10, r2; \ + add r3, r11, r3; \ + eor r6, r6, r2, ror #23; \ + add r2, r12, r0; \ + eor r7, r7, r3, ror #23; \ + add r3, lr, r1; \ + strd r6, [sp, #10*4]; \ + eor r8, r8, r2, ror #23; \ + eor r9, r9, r3, ror #23; \ + ldrd r2, [sp, #6*4]; \ + add r10, r6, r10; \ + add r11, r7, r11; \ + eor r2, r2, r10, ror #19; \ + add r10, r8, r12; \ + eor r3, r3, r11, ror #19; \ + add r11, r9, lr; \ + eor r4, r4, r10, ror #19; \ + eor r5, r5, r11, ror #19; \ + ldrd r10, [sp, #2*4]; \ + add r6, r2, r6; \ + add r7, r3, r7; \ + eor r10, r10, r6, ror #14; \ + add 
r6, r4, r8; \ + eor r11, r11, r7, ror #14; \ + add r7, r5, r9; \ + eor r0, r0, r6, ror #14; \ + eor r1, r1, r7, ror #14; \ + ldrd r6, [sp, #14*4]; \ + strd r2, [sp, #6*4]; \ + strd r10, [sp, #2*4]; \ + add r6, r11, r6; \ + add r7, r0, r7; \ + eor r4, r4, r6, ror #25; \ + add r6, r1, r12; \ + eor r5, r5, r7, ror #25; \ + add r7, r10, lr; \ + eor r2, r2, r6, ror #25; \ + eor r3, r3, r7, ror #25; \ + strd r2, [sp, #6*4]; \ + add r10, r3, r10; \ + ldrd r6, [sp, #10*4]; \ + add r11, r4, r11; \ + eor r8, r8, r10, ror #23; \ + add r10, r5, r0; \ + eor r9, r9, r11, ror #23; \ + add r11, r2, r1; \ + eor r6, r6, r10, ror #23; \ + eor r7, r7, r11, ror #23; \ + strd r6, [sp, #10*4]; \ + add r2, r7, r2; \ + ldrd r10, [sp, #14*4]; \ + add r3, r8, r3; \ + eor r12, r12, r2, ror #19; \ + add r2, r9, r4; \ + eor lr, lr, r3, ror #19; \ + add r3, r6, r5; \ + eor r10, r10, r2, ror #19; \ + eor r11, r11, r3, ror #19; \ + ldrd r2, [sp, #2*4]; \ + add r6, r11, r6; \ + add r7, r12, r7; \ + eor r0, r0, r6, ror #14; \ + add r6, lr, r8; \ + eor r1, r1, r7, ror #14; \ + add r7, r10, r9; \ + eor r2, r2, r6, ror #14; \ + eor r3, r3, r7, ror #14; \ + + +#define salsa8_core() \ + ldmia sp, {r0-r12, lr}; \ + ldrd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + ldrd r6, [sp, #6*4]; \ + strd r2, [sp, #2*4]; \ + strd r10, [sp, #14*4]; \ + salsa8_core_doubleround_body(); \ + stmia sp, {r0-r5}; \ + strd r8, [sp, #8*4]; \ + str r12, [sp, #12*4]; \ + str lr, [sp, #13*4]; \ + strd r10, [sp, #14*4]; \ + #else -.macro scrypt_shuffle -.endm +#define scrypt_shuffle() \ -.macro salsa8_core_doubleround_body - ldr r8, [sp, #8*4] - add r11, r11, r10 - ldr lr, [sp, #13*4] - add r12, r12, r3 - eor r2, r2, r11, ror #23 - add r11, r4, r0 - eor r7, r7, r12, ror #23 - add r12, r9, r5 - str r9, [sp, #9*4] - eor r8, r8, r11, ror #23 - str r10, [sp, #14*4] - eor lr, lr, r12, ror #23 - - ldr r11, [sp, #11*4] - add r9, lr, r9 - ldr r12, [sp, #12*4] - add r10, r2, r10 - eor r1, r1, r9, ror #19 - add r9, r7, r3 - eor r6, r6, r10, ror #19 - add r10, r8, r4 - str r8, [sp, #8*4] - eor r11, r11, r9, ror #19 - str lr, [sp, #13*4] - eor r12, r12, r10, ror #19 - - ldr r9, [sp, #10*4] - add r8, r12, r8 - ldr r10, [sp, #15*4] - add lr, r1, lr - eor r0, r0, r8, ror #14 - add r8, r6, r2 - eor r5, r5, lr, ror #14 - add lr, r11, r7 - eor r9, r9, r8, ror #14 - ldr r8, [sp, #9*4] - eor r10, r10, lr, ror #14 - ldr lr, [sp, #14*4] - - - add r8, r9, r8 - str r9, [sp, #10*4] - add lr, r10, lr - str r10, [sp, #15*4] - eor r11, r11, r8, ror #25 - add r8, r0, r3 - eor r12, r12, lr, ror #25 - add lr, r5, r4 - eor r1, r1, r8, ror #25 - ldr r8, [sp, #8*4] - eor r6, r6, lr, ror #25 - - add r9, r11, r9 - ldr lr, [sp, #13*4] - add r10, r12, r10 - eor r8, r8, r9, ror #23 - add r9, r1, r0 - eor lr, lr, r10, ror #23 - add r10, r6, r5 - str r11, [sp, #11*4] - eor r2, r2, r9, ror #23 - str r12, [sp, #12*4] - eor r7, r7, r10, ror #23 - - ldr r9, [sp, #9*4] - add r11, r8, r11 - ldr r10, [sp, #14*4] - add r12, lr, r12 - eor r9, r9, r11, ror #19 - add r11, r2, r1 - eor r10, r10, r12, ror #19 - add r12, r7, r6 - str r8, [sp, #8*4] - eor r3, r3, r11, ror #19 - str lr, [sp, #13*4] - eor r4, r4, r12, ror #19 -.endm -.macro salsa8_core - ldmia sp, {r0-r7} - - ldr r12, [sp, #15*4] - ldr r8, [sp, #11*4] - ldr lr, [sp, #12*4] - - ldr r9, [sp, #9*4] - add r8, r8, 
r12 - ldr r11, [sp, #10*4] - add lr, lr, r0 - eor r3, r3, r8, ror #25 - add r8, r5, r1 - ldr r10, [sp, #14*4] - eor r4, r4, lr, ror #25 - add lr, r11, r6 - eor r9, r9, r8, ror #25 - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - eor r11, r11, r8, ror #14 - add r8, r3, r2 - eor r12, r12, lr, ror #14 - add lr, r4, r7 - eor r0, r0, r8, ror #14 - ldr r8, [sp, #11*4] - eor r5, r5, lr, ror #14 - ldr lr, [sp, #12*4] - - add r8, r8, r12 - str r11, [sp, #10*4] - add lr, lr, r0 - str r12, [sp, #15*4] - eor r3, r3, r8, ror #25 - add r8, r5, r1 - eor r4, r4, lr, ror #25 - add lr, r11, r6 - str r9, [sp, #9*4] - eor r9, r9, r8, ror #25 - str r10, [sp, #14*4] - eor r10, r10, lr, ror #25 - - salsa8_core_doubleround_body - - ldr r11, [sp, #10*4] - add r8, r9, r8 - ldr r12, [sp, #15*4] - add lr, r10, lr - str r9, [sp, #9*4] - eor r11, r11, r8, ror #14 - eor r12, r12, lr, ror #14 - add r8, r3, r2 - str r10, [sp, #14*4] - add lr, r4, r7 - str r11, [sp, #10*4] - eor r0, r0, r8, ror #14 - str r12, [sp, #15*4] - eor r5, r5, lr, ror #14 - - stmia sp, {r0-r7} -.endm +#define salsa8_core_doubleround_body() \ + ldr r8, [sp, #8*4]; \ + add r11, r11, r10; \ + ldr lr, [sp, #13*4]; \ + add r12, r12, r3; \ + eor r2, r2, r11, ror #23; \ + add r11, r4, r0; \ + eor r7, r7, r12, ror #23; \ + add r12, r9, r5; \ + str r9, [sp, #9*4]; \ + eor r8, r8, r11, ror #23; \ + str r10, [sp, #14*4]; \ + eor lr, lr, r12, ror #23; \ + ldr r11, [sp, #11*4]; \ + add r9, lr, r9; \ + ldr r12, [sp, #12*4]; \ + add r10, r2, r10; \ + eor r1, r1, r9, ror #19; \ + add r9, r7, r3; \ + eor r6, r6, r10, ror #19; \ + add r10, r8, r4; \ + str r8, [sp, #8*4]; \ + eor r11, r11, r9, ror #19; \ + str lr, [sp, #13*4]; \ + eor r12, r12, r10, ror #19; \ + ldr r9, [sp, #10*4]; \ + add r8, r12, r8; \ + ldr r10, [sp, #15*4]; \ + add lr, r1, lr; \ + eor r0, r0, r8, ror #14; \ + add r8, r6, r2; \ + eor r5, r5, lr, ror #14; \ + add lr, r11, r7; \ + eor r9, r9, r8, ror #14; \ + ldr r8, [sp, #9*4]; \ + eor r10, r10, lr, ror #14; \ + ldr lr, [sp, #14*4]; \ + add r8, r9, r8; \ + str r9, [sp, #10*4]; \ + add lr, r10, lr; \ + str r10, [sp, #15*4]; \ + eor r11, r11, r8, ror #25; \ + add r8, r0, r3; \ + eor r12, r12, lr, ror #25; \ + add lr, r5, r4; \ + eor r1, r1, r8, ror #25; \ + ldr r8, [sp, #8*4]; \ + eor r6, r6, lr, ror #25; \ + add r9, r11, r9; \ + ldr 
lr, [sp, #13*4]; \ + add r10, r12, r10; \ + eor r8, r8, r9, ror #23; \ + add r9, r1, r0; \ + eor lr, lr, r10, ror #23; \ + add r10, r6, r5; \ + str r11, [sp, #11*4]; \ + eor r2, r2, r9, ror #23; \ + str r12, [sp, #12*4]; \ + eor r7, r7, r10, ror #23; \ + ldr r9, [sp, #9*4]; \ + add r11, r8, r11; \ + ldr r10, [sp, #14*4]; \ + add r12, lr, r12; \ + eor r9, r9, r11, ror #19; \ + add r11, r2, r1; \ + eor r10, r10, r12, ror #19; \ + add r12, r7, r6; \ + str r8, [sp, #8*4]; \ + eor r3, r3, r11, ror #19; \ + str lr, [sp, #13*4]; \ + eor r4, r4, r12, ror #19; \ + + +#define salsa8_core() \ + ldmia sp, {r0-r7}; \ + ldr r12, [sp, #15*4]; \ + ldr r8, [sp, #11*4]; \ + ldr lr, [sp, #12*4]; \ + ldr r9, [sp, #9*4]; \ + add r8, r8, r12; \ + ldr r11, [sp, #10*4]; \ + add lr, lr, r0; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + ldr r10, [sp, #14*4]; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + eor r9, r9, r8, ror #25; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + eor r11, r11, r8, ror #14; \ + add r8, r3, r2; \ + eor r12, r12, lr, ror #14; \ + add lr, r4, r7; \ + eor r0, r0, r8, ror #14; \ + ldr r8, [sp, #11*4]; \ + eor r5, r5, lr, ror #14; \ + ldr lr, [sp, #12*4]; \ + add r8, r8, r12; \ + str r11, [sp, #10*4]; \ + add lr, lr, r0; \ + str r12, [sp, #15*4]; \ + eor r3, r3, r8, ror #25; \ + add r8, r5, r1; \ + eor r4, r4, lr, ror #25; \ + add lr, r11, r6; \ + str r9, [sp, #9*4]; \ + eor r9, r9, r8, ror #25; \ + str r10, [sp, #14*4]; \ + eor r10, r10, lr, ror #25; \ + salsa8_core_doubleround_body(); \ + ldr r11, [sp, #10*4]; \ + add r8, r9, r8; \ + ldr r12, [sp, #15*4]; \ + add lr, r10, lr; \ + str r9, [sp, #9*4]; \ + eor r11, r11, r8, ror #14; \ + eor r12, r12, lr, ror #14; \ + add r8, r3, r2; \ + str r10, [sp, #14*4]; \ + add lr, r4, r7; \ + str r11, [sp, #10*4]; \ + eor r0, r0, r8, ror #14; \ + str r12, [sp, #15*4]; \ + eor r5, r5, lr, ror #14; \ + stmia sp, {r0-r7}; \ + #endif -.macro scrypt_core_macro1a_x4 - ldmia r0, {r4-r7} - ldmia lr!, {r8-r11} - stmia r1!, {r4-r7} - stmia r3!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm +#define 
scrypt_core_macro1a_x4() \ + ldmia r0, {r4-r7}; \ + ldmia lr!, {r8-r11}; \ + stmia r1!, {r4-r7}; \ + stmia r3!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + + +#define scrypt_core_macro1b_x4() \ + ldmia r3!, {r8-r11}; \ + ldmia r2, {r4-r7}; \ + eor r8, r8, r4; \ + eor r9, r9, r5; \ + eor r10, r10, r6; \ + eor r11, r11, r7; \ + ldmia r0, {r4-r7}; \ + stmia r2!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + ldmia r1!, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + + +#define scrypt_core_macro2_x4() \ + ldmia r12, {r4-r7}; \ + ldmia r0, {r8-r11}; \ + add r4, r4, r8; \ + add r5, r5, r9; \ + add r6, r6, r10; \ + add r7, r7, r11; \ + stmia r0!, {r4-r7}; \ + ldmia r2, {r8-r11}; \ + eor r4, r4, r8; \ + eor r5, r5, r9; \ + eor r6, r6, r10; \ + eor r7, r7, r11; \ + stmia r2!, {r4-r7}; \ + stmia r12!, {r4-r7}; \ + -.macro scrypt_core_macro1b_x4 - ldmia r3!, {r8-r11} - ldmia r2, {r4-r7} - eor r8, r8, r4 - eor r9, r9, r5 - eor r10, r10, r6 - eor r11, r11, r7 - ldmia r0, {r4-r7} - stmia r2!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - ldmia r1!, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r0!, {r4-r7} - stmia r12!, {r4-r7} -.endm +#define scrypt_core_macro3_x4() \ + ldmia r1!, {r4-r7}; \ + ldmia r0, {r8-r11}; \ + add r4, r4, r8; \ + add r5, r5, r9; \ + add r6, r6, r10; \ + add r7, r7, r11; \ + stmia r0!, {r4-r7}; \ -.macro scrypt_core_macro2_x4 - ldmia r12, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} - ldmia r2, {r8-r11} - eor r4, r4, r8 - eor r5, r5, r9 - eor r6, r6, r10 - eor r7, r7, r11 - stmia r2!, {r4-r7} - stmia r12!, {r4-r7} -.endm -.macro scrypt_core_macro3_x4 - ldmia r1!, {r4-r7} - ldmia r0, {r8-r11} - add r4, r4, r8 - add r5, r5, r9 - add r6, r6, r10 - add r7, r7, r11 - stmia r0!, {r4-r7} -.endm +#define scrypt_core_macro3_x6() \ + ldmia r1!, {r2-r7}; \ + ldmia r0, {r8-r12, lr}; \ + add r2, r2, r8; \ + add r3, r3, r9; \ + add r4, r4, r10; \ + add r5, r5, r11; \ + add r6, r6, r12; \ + add r7, r7, lr; \ + stmia r0!, {r2-r7}; \ -.macro scrypt_core_macro3_x6 - ldmia r1!, {r2-r7} - ldmia r0, {r8-r12, lr} - add r2, r2, r8 - add r3, r3, r9 - add r4, r4, r10 - add r5, r5, r11 - add r6, r6, r12 - add r7, r7, lr - stmia r0!, {r2-r7} -.endm .text @@ -477,7 +446,7 @@ _scrypt_core: str r12, [sp, #20*4] str r2, [sp, #21*4] - scrypt_shuffle + scrypt_shuffle() ldr r2, [sp, #21*4] str r0, [sp, #16*4] @@ -487,32 +456,32 @@ scrypt_core_loop1: add lr, r0, #16*4 add r3, r1, #16*4 mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() str r1, [sp, #17*4] - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r1, sp add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() ldr r3, [sp, #17*4] ldr r12, [sp, #18*4] - 
scrypt_core_macro3_x4 + scrypt_core_macro3_x4() add r1, r3, #16*4 sub r0, r0, #32*4 @@ -536,29 +505,29 @@ scrypt_core_loop2: pld [r1, #24*4] pld [r1, #8*4] #endif - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() - salsa8_core + salsa8_core() ldr r0, [sp, #16*4] mov r1, sp ldr r3, [sp, #17*4] add r0, r0, #16*4 ldr r2, [sp, #21*4] - scrypt_core_macro3_x4 + scrypt_core_macro3_x4() and r4, r4, r2 add r3, r3, r4, lsl #7 str r3, [sp, #19*4] @@ -566,8 +535,8 @@ scrypt_core_loop2: pld [r3, #16*4] pld [r3] #endif - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() ldr r12, [sp, #18*4] sub r0, r0, #32*4 @@ -575,7 +544,7 @@ scrypt_core_loop2: subs r12, r12, #1 bne scrypt_core_loop2 - scrypt_shuffle + scrypt_shuffle() ldr sp, [sp, #20*4] #ifdef __thumb__ @@ -588,201 +557,193 @@ scrypt_core_loop2: #ifdef __ARM_NEON__ -.macro salsa8_core_3way_doubleround - ldrd r6, [sp, #6*4] - vadd.u32 q4, q0, q1 - add r6, r2, r6 - vadd.u32 q6, q8, q9 - add r7, r3, r7 - vshl.u32 q5, q4, #7 - eor r10, r10, r6, ror #25 - vshl.u32 q7, q6, #7 - add r6, r0, r4 - vshr.u32 q4, q4, #32-7 - eor r11, r11, r7, ror #25 - vshr.u32 q6, q6, #32-7 - add r7, r1, r5 - veor.u32 q3, q3, q5 - strd r10, [sp, #14*4] - veor.u32 q11, q11, q7 - eor r12, r12, r6, ror #25 - veor.u32 q3, q3, q4 - eor lr, lr, r7, ror #25 - veor.u32 q11, q11, q6 - - ldrd r6, [sp, #10*4] - vadd.u32 q4, q3, q0 - add r2, r10, r2 - vadd.u32 q6, q11, q8 - add r3, r11, r3 - vshl.u32 q5, q4, #9 - eor r6, r6, r2, ror #23 - vshl.u32 q7, q6, #9 - add r2, r12, r0 - vshr.u32 q4, q4, #32-9 - eor r7, r7, r3, ror #23 - vshr.u32 q6, q6, #32-9 - add r3, lr, r1 - veor.u32 q2, q2, q5 - strd r6, [sp, #10*4] - veor.u32 q10, q10, q7 - eor r8, r8, r2, ror #23 - veor.u32 q2, q2, q4 - eor r9, r9, r3, ror #23 - veor.u32 q10, q10, q6 - - ldrd r2, [sp, #6*4] - vadd.u32 q4, q2, q3 - add r10, r6, r10 - vadd.u32 q6, q10, q11 - add r11, r7, r11 - vext.u32 q3, q3, q3, #3 - eor r2, r2, r10, ror #19 - vshl.u32 q5, q4, #13 - add r10, r8, r12 - vext.u32 q11, q11, q11, #3 - eor r3, r3, r11, ror #19 - vshl.u32 q7, q6, #13 - add r11, r9, lr - vshr.u32 q4, q4, #32-13 - eor r4, r4, r10, ror #19 - vshr.u32 q6, q6, #32-13 - eor r5, r5, r11, ror #19 - veor.u32 q1, q1, q5 - veor.u32 q9, q9, q7 - veor.u32 q1, q1, q4 - veor.u32 q9, q9, q6 - - ldrd r10, [sp, #2*4] - vadd.u32 q4, q1, q2 - add r6, r2, r6 - vadd.u32 q6, q9, q10 - add r7, r3, r7 - vswp.u32 d4, d5 - eor r10, r10, r6, ror #14 - vshl.u32 q5, q4, #18 - add r6, r4, r8 - vswp.u32 d20, d21 - eor r11, r11, r7, ror #14 - vshl.u32 q7, q6, #18 - add r7, r5, r9 - vshr.u32 q4, q4, #32-18 - eor r0, r0, r6, ror #14 - vshr.u32 q6, q6, #32-18 - eor r1, r1, r7, ror #14 - veor.u32 q0, q0, q5 - ldrd r6, [sp, #14*4] - veor.u32 q8, q8, q7 - veor.u32 q0, q0, q4 - veor.u32 q8, q8, q6 - - - strd r2, [sp, #6*4] - vadd.u32 q4, q0, q3 - strd r10, [sp, #2*4] - vadd.u32 q6, q8, q11 - add r6, r11, r6 - vext.u32 q1, q1, q1, #1 - add r7, r0, r7 - vshl.u32 q5, q4, #7 - eor r4, r4, r6, ror #25 - vext.u32 q9, q9, q9, #1 - add r6, r1, r12 - vshl.u32 q7, q6, #7 - eor r5, r5, r7, ror #25 - vshr.u32 q4, q4, #32-7 
- add r7, r10, lr - vshr.u32 q6, q6, #32-7 - eor r2, r2, r6, ror #25 - veor.u32 q1, q1, q5 - eor r3, r3, r7, ror #25 - veor.u32 q9, q9, q7 - strd r2, [sp, #6*4] - veor.u32 q1, q1, q4 - veor.u32 q9, q9, q6 - - add r10, r3, r10 - vadd.u32 q4, q1, q0 - ldrd r6, [sp, #10*4] - vadd.u32 q6, q9, q8 - add r11, r4, r11 - vshl.u32 q5, q4, #9 - eor r8, r8, r10, ror #23 - vshl.u32 q7, q6, #9 - add r10, r5, r0 - vshr.u32 q4, q4, #32-9 - eor r9, r9, r11, ror #23 - vshr.u32 q6, q6, #32-9 - add r11, r2, r1 - veor.u32 q2, q2, q5 - eor r6, r6, r10, ror #23 - veor.u32 q10, q10, q7 - eor r7, r7, r11, ror #23 - veor.u32 q2, q2, q4 - strd r6, [sp, #10*4] - veor.u32 q10, q10, q6 - - add r2, r7, r2 - vadd.u32 q4, q2, q1 - ldrd r10, [sp, #14*4] - vadd.u32 q6, q10, q9 - add r3, r8, r3 - vext.u32 q1, q1, q1, #3 - eor r12, r12, r2, ror #19 - vshl.u32 q5, q4, #13 - add r2, r9, r4 - vext.u32 q9, q9, q9, #3 - eor lr, lr, r3, ror #19 - vshl.u32 q7, q6, #13 - add r3, r6, r5 - vshr.u32 q4, q4, #32-13 - eor r10, r10, r2, ror #19 - vshr.u32 q6, q6, #32-13 - eor r11, r11, r3, ror #19 - veor.u32 q3, q3, q5 - veor.u32 q11, q11, q7 - veor.u32 q3, q3, q4 - veor.u32 q11, q11, q6 - - ldrd r2, [sp, #2*4] - vadd.u32 q4, q3, q2 - add r6, r11, r6 - vadd.u32 q6, q11, q10 - add r7, r12, r7 - vswp.u32 d4, d5 - eor r0, r0, r6, ror #14 - vshl.u32 q5, q4, #18 - add r6, lr, r8 - vswp.u32 d20, d21 - eor r1, r1, r7, ror #14 - vshl.u32 q7, q6, #18 - add r7, r10, r9 - vext.u32 q3, q3, q3, #1 - eor r2, r2, r6, ror #14 - vshr.u32 q4, q4, #32-18 - eor r3, r3, r7, ror #14 - vshr.u32 q6, q6, #32-18 - strd r2, [sp, #2*4] - vext.u32 q11, q11, q11, #1 - strd r10, [sp, #14*4] - veor.u32 q0, q0, q5 - veor.u32 q8, q8, q7 - veor.u32 q0, q0, q4 - veor.u32 q8, q8, q6 -.endm +#define salsa8_core_3way_doubleround() \ + ldrd r6, [sp, #6*4]; \ + vadd.u32 q4, q0, q1; \ + add r6, r2, r6; \ + vadd.u32 q6, q8, q9; \ + add r7, r3, r7; \ + vshl.u32 q5, q4, #7; \ + eor r10, r10, r6, ror #25; \ + vshl.u32 q7, q6, #7; \ + add r6, r0, r4; \ + vshr.u32 q4, q4, #32-7; \ + eor r11, r11, r7, ror #25; \ + vshr.u32 q6, q6, #32-7; \ + add r7, r1, r5; \ + veor.u32 q3, q3, q5; \ + strd r10, [sp, #14*4]; \ + veor.u32 q11, q11, q7; \ + eor r12, r12, r6, ror #25; \ + veor.u32 q3, q3, q4; \ + eor lr, lr, r7, ror #25; \ + veor.u32 q11, q11, q6; \ + ldrd r6, [sp, #10*4]; \ + vadd.u32 q4, q3, q0; \ + add r2, r10, r2; \ + vadd.u32 q6, q11, q8; \ + add r3, r11, r3; \ + vshl.u32 q5, q4, #9; \ + eor r6, r6, r2, ror #23; \ + vshl.u32 q7, q6, #9; \ + add r2, r12, r0; \ + vshr.u32 q4, q4, #32-9; \ + eor r7, r7, r3, ror #23; \ + vshr.u32 q6, q6, #32-9; \ + add r3, lr, r1; \ + veor.u32 q2, q2, q5; \ + strd r6, [sp, #10*4]; \ + veor.u32 q10, q10, q7; \ + eor r8, r8, r2, ror #23; \ + veor.u32 q2, q2, q4; \ + eor r9, r9, r3, ror #23; \ + veor.u32 q10, q10, q6; \ + ldrd r2, [sp, #6*4]; \ + vadd.u32 q4, q2, q3; \ + add r10, r6, r10; \ + vadd.u32 q6, q10, q11; \ + add r11, r7, r11; \ + vext.u32 q3, q3, q3, #3; \ + eor r2, r2, r10, ror #19; \ + vshl.u32 q5, q4, #13; \ + add r10, r8, r12; \ + vext.u32 q11, q11, q11, #3; \ + eor r3, r3, r11, ror #19; \ + vshl.u32 q7, q6, #13; \ + add r11, r9, lr; \ + vshr.u32 q4, q4, #32-13; \ + eor r4, r4, r10, ror #19; \ + vshr.u32 q6, q6, #32-13; \ + eor r5, r5, r11, ror #19; \ + veor.u32 q1, q1, q5; \ + veor.u32 q9, q9, q7; \ + veor.u32 q1, q1, q4; \ + veor.u32 q9, q9, q6; \ + ldrd r10, [sp, #2*4]; \ + vadd.u32 q4, q1, q2; \ + add r6, r2, r6; \ + vadd.u32 q6, q9, q10; \ + add r7, r3, r7; \ + vswp.u32 d4, d5; \ + eor r10, r10, r6, ror #14; \ + vshl.u32 q5, q4, #18; \ + 
add r6, r4, r8; \ + vswp.u32 d20, d21; \ + eor r11, r11, r7, ror #14; \ + vshl.u32 q7, q6, #18; \ + add r7, r5, r9; \ + vshr.u32 q4, q4, #32-18; \ + eor r0, r0, r6, ror #14; \ + vshr.u32 q6, q6, #32-18; \ + eor r1, r1, r7, ror #14; \ + veor.u32 q0, q0, q5; \ + ldrd r6, [sp, #14*4]; \ + veor.u32 q8, q8, q7; \ + veor.u32 q0, q0, q4; \ + veor.u32 q8, q8, q6; \ + strd r2, [sp, #6*4]; \ + vadd.u32 q4, q0, q3; \ + strd r10, [sp, #2*4]; \ + vadd.u32 q6, q8, q11; \ + add r6, r11, r6; \ + vext.u32 q1, q1, q1, #1; \ + add r7, r0, r7; \ + vshl.u32 q5, q4, #7; \ + eor r4, r4, r6, ror #25; \ + vext.u32 q9, q9, q9, #1; \ + add r6, r1, r12; \ + vshl.u32 q7, q6, #7; \ + eor r5, r5, r7, ror #25; \ + vshr.u32 q4, q4, #32-7; \ + add r7, r10, lr; \ + vshr.u32 q6, q6, #32-7; \ + eor r2, r2, r6, ror #25; \ + veor.u32 q1, q1, q5; \ + eor r3, r3, r7, ror #25; \ + veor.u32 q9, q9, q7; \ + strd r2, [sp, #6*4]; \ + veor.u32 q1, q1, q4; \ + veor.u32 q9, q9, q6; \ + add r10, r3, r10; \ + vadd.u32 q4, q1, q0; \ + ldrd r6, [sp, #10*4]; \ + vadd.u32 q6, q9, q8; \ + add r11, r4, r11; \ + vshl.u32 q5, q4, #9; \ + eor r8, r8, r10, ror #23; \ + vshl.u32 q7, q6, #9; \ + add r10, r5, r0; \ + vshr.u32 q4, q4, #32-9; \ + eor r9, r9, r11, ror #23; \ + vshr.u32 q6, q6, #32-9; \ + add r11, r2, r1; \ + veor.u32 q2, q2, q5; \ + eor r6, r6, r10, ror #23; \ + veor.u32 q10, q10, q7; \ + eor r7, r7, r11, ror #23; \ + veor.u32 q2, q2, q4; \ + strd r6, [sp, #10*4]; \ + veor.u32 q10, q10, q6; \ + add r2, r7, r2; \ + vadd.u32 q4, q2, q1; \ + ldrd r10, [sp, #14*4]; \ + vadd.u32 q6, q10, q9; \ + add r3, r8, r3; \ + vext.u32 q1, q1, q1, #3; \ + eor r12, r12, r2, ror #19; \ + vshl.u32 q5, q4, #13; \ + add r2, r9, r4; \ + vext.u32 q9, q9, q9, #3; \ + eor lr, lr, r3, ror #19; \ + vshl.u32 q7, q6, #13; \ + add r3, r6, r5; \ + vshr.u32 q4, q4, #32-13; \ + eor r10, r10, r2, ror #19; \ + vshr.u32 q6, q6, #32-13; \ + eor r11, r11, r3, ror #19; \ + veor.u32 q3, q3, q5; \ + veor.u32 q11, q11, q7; \ + veor.u32 q3, q3, q4; \ + veor.u32 q11, q11, q6; \ + ldrd r2, [sp, #2*4]; \ + vadd.u32 q4, q3, q2; \ + add r6, r11, r6; \ + vadd.u32 q6, q11, q10; \ + add r7, r12, r7; \ + vswp.u32 d4, d5; \ + eor r0, r0, r6, ror #14; \ + vshl.u32 q5, q4, #18; \ + add r6, lr, r8; \ + vswp.u32 d20, d21; \ + eor r1, r1, r7, ror #14; \ + vshl.u32 q7, q6, #18; \ + add r7, r10, r9; \ + vext.u32 q3, q3, q3, #1; \ + eor r2, r2, r6, ror #14; \ + vshr.u32 q4, q4, #32-18; \ + eor r3, r3, r7, ror #14; \ + vshr.u32 q6, q6, #32-18; \ + strd r2, [sp, #2*4]; \ + vext.u32 q11, q11, q11, #1; \ + strd r10, [sp, #14*4]; \ + veor.u32 q0, q0, q5; \ + veor.u32 q8, q8, q7; \ + veor.u32 q0, q0, q4; \ + veor.u32 q8, q8, q6; \ + + +#define salsa8_core_3way() \ + ldmia sp, {r0-r12, lr}; \ + ldrd r10, [sp, #14*4]; \ + salsa8_core_3way_doubleround(); \ + salsa8_core_3way_doubleround(); \ + salsa8_core_3way_doubleround(); \ + salsa8_core_3way_doubleround(); \ + stmia sp, {r0-r5}; \ + strd r8, [sp, #8*4]; \ + str r12, [sp, #12*4]; \ + str lr, [sp, #13*4]; \ -.macro salsa8_core_3way - ldmia sp, {r0-r12, lr} - ldrd r10, [sp, #14*4] - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - salsa8_core_3way_doubleround - stmia sp, {r0-r5} - strd r8, [sp, #8*4] - str r12, [sp, #12*4] - str lr, [sp, #13*4] -.endm .text .code 32 @@ -866,11 +827,11 @@ scrypt_core_3way_loop1: add r3, r1, #16*4 str r1, [sp, #4*16+1*4] mov r12, sp - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 - scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4() + scrypt_core_macro1a_x4() + 
scrypt_core_macro1a_x4() ldr r2, [sp, #4*16+3*4] - scrypt_core_macro1a_x4 + scrypt_core_macro1a_x4() sub r1, r1, #4*16 add r1, r1, r2, lsl #7 @@ -891,15 +852,15 @@ scrypt_core_3way_loop1: add r12, sp, #256 vstmia r12, {q8-q11} - salsa8_core_3way + salsa8_core_3way() ldr r0, [sp, #4*16+0*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() add lr, sp, #128 vldmia lr, {q4-q7} @@ -932,14 +893,14 @@ scrypt_core_3way_loop1: vmov q14, q10 vmov q15, q11 - salsa8_core_3way + salsa8_core_3way() ldr r0, [sp, #4*16+0*4] mov r1, sp add r0, r0, #16*4 - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 - scrypt_core_macro3_x4 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() + scrypt_core_macro3_x4() sub r0, r0, #8*16 ldr r1, [sp, #4*16+1*4] @@ -983,10 +944,10 @@ scrypt_core_3way_loop2: add r2, r0, #16*4 add r3, r1, #16*4 mov r12, sp - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 - scrypt_core_macro1b_x4 + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() + scrypt_core_macro1b_x4() ldr r1, [sp, #4*16+1*4] ldr r2, [sp, #4*16+3*4] @@ -1043,15 +1004,15 @@ scrypt_core_3way_loop2: veor.u32 q11, q11, q15 vstmia r12, {q8-q15} - salsa8_core_3way + salsa8_core_3way() ldr r0, [sp, #4*16+0*4] mov r12, sp add r2, r0, #16*4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 - scrypt_core_macro2_x4 + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() + scrypt_core_macro2_x4() add lr, sp, #128 vldmia lr, {q4-q7} @@ -1086,7 +1047,7 @@ scrypt_core_3way_loop2: vmov q14, q10 vmov q15, q11 - salsa8_core_3way + salsa8_core_3way() ldr r0, [sp, #4*16+0*4] ldr r3, [sp, #4*16+1*4] @@ -1094,15 +1055,15 @@ scrypt_core_3way_loop2: mov r1, sp add r0, r0, #16*4 sub r2, r2, #1 - scrypt_core_macro3_x4 + scrypt_core_macro3_x4() and r4, r4, r2 add r3, r3, r4, lsl #7 pld [r3, #16*4] pld [r3] pld [r3, #24*4] pld [r3, #8*4] - scrypt_core_macro3_x6 - scrypt_core_macro3_x6 + scrypt_core_macro3_x6() + scrypt_core_macro3_x6() add lr, sp, #128 add r4, sp, #128+4*16 diff --git a/scrypt-x64.S b/scrypt-x64.S index f9185d490..42c4ffd67 100644 --- a/scrypt-x64.S +++ b/scrypt-x64.S @@ -112,166 +112,152 @@ scrypt_best_throughput_exit: ret -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm +#define scrypt_shuffle(src, so, dest, do) \ + movl so+60(src), %eax; \ + movl so+44(src), %ebx; \ + movl so+28(src), %ecx; \ + movl so+12(src), %edx; \ + movl %eax, do+12(dest); \ + movl %ebx, do+28(dest); \ + movl %ecx, do+44(dest); \ + movl %edx, do+60(dest); \ + 
movl so+40(src), %eax; \ + movl so+8(src), %ebx; \ + movl so+48(src), %ecx; \ + movl so+16(src), %edx; \ + movl %eax, do+8(dest); \ + movl %ebx, do+40(dest); \ + movl %ecx, do+16(dest); \ + movl %edx, do+48(dest); \ + movl so+20(src), %eax; \ + movl so+4(src), %ebx; \ + movl so+52(src), %ecx; \ + movl so+36(src), %edx; \ + movl %eax, do+4(dest); \ + movl %ebx, do+20(dest); \ + movl %ecx, do+36(dest); \ + movl %edx, do+52(dest); \ + movl so+0(src), %eax; \ + movl so+24(src), %ebx; \ + movl so+32(src), %ecx; \ + movl so+56(src), %edx; \ + movl %eax, do+0(dest); \ + movl %ebx, do+24(dest); \ + movl %ecx, do+32(dest); \ + movl %edx, do+56(dest); \ -.macro salsa8_core_gen_doubleround - movq 72(%rsp), %r15 - - leaq (%r14, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %r9d - leaq (%rdi, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r10d - leaq (%rdx, %r9), %rbp - roll $9, %ebp - xorl %ebp, %r11d - leaq (%r15, %r10), %rbp - roll $9, %ebp - xorl %ebp, %r13d - - leaq (%r9, %r11), %rbp - roll $13, %ebp - xorl %ebp, %r14d - leaq (%r10, %r13), %rbp - roll $13, %ebp - xorl %ebp, %edi - leaq (%r11, %r14), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r13, %rdi), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%rax, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %ebx - leaq (%rbp, %rbx), %r15 - roll $9, %r15d - xorl %r15d, %ecx - leaq (%rbx, %rcx), %r15 - roll $13, %r15d - xorl %r15d, %eax - leaq (%rcx, %rax), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%r12, %r15), %rbp - roll $7, %ebp - xorl %ebp, %esi - leaq (%r15, %rsi), %rbp - roll $9, %ebp - xorl %ebp, %r8d - leaq (%rsi, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r12d - leaq (%r8, %r12), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) - movq 72(%rsp), %r15 - - leaq (%rsi, %rdx), %rbp - roll $7, %ebp - xorl %ebp, %edi - leaq (%r9, %r15), %rbp - roll $7, %ebp - xorl %ebp, %eax - leaq (%rdx, %rdi), %rbp - roll $9, %ebp - xorl %ebp, %ecx - leaq (%r15, %rax), %rbp - roll $9, %ebp - xorl %ebp, %r8d - - leaq (%rdi, %rcx), %rbp - roll $13, %ebp - xorl %ebp, %esi - leaq (%rax, %r8), %rbp - roll $13, %ebp - xorl %ebp, %r9d - leaq (%rcx, %rsi), %rbp - roll $18, %ebp - xorl %ebp, %edx - leaq (%r8, %r9), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq 48(%rsp), %rbp - movq %r15, 72(%rsp) - - leaq (%r10, %rbp), %r15 - roll $7, %r15d - xorl %r15d, %r12d - leaq (%rbp, %r12), %r15 - roll $9, %r15d - xorl %r15d, %r11d - leaq (%r12, %r11), %r15 - roll $13, %r15d - xorl %r15d, %r10d - leaq (%r11, %r10), %r15 - roll $18, %r15d - xorl %r15d, %ebp - - movq 88(%rsp), %r15 - movq %rbp, 48(%rsp) - - leaq (%rbx, %r15), %rbp - roll $7, %ebp - xorl %ebp, %r14d - leaq (%r15, %r14), %rbp - roll $9, %ebp - xorl %ebp, %r13d - leaq (%r14, %r13), %rbp - roll $13, %ebp - xorl %ebp, %ebx - leaq (%r13, %rbx), %rbp - roll $18, %ebp - xorl %ebp, %r15d - - movq %r15, 88(%rsp) -.endm + +#define salsa8_core_gen_doubleround() \ + movq 72(%rsp), %r15; \ + leaq (%r14, %rdx), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %r9d; \ + leaq (%rdi, %r15), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %r10d; \ + leaq (%rdx, %r9), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %r11d; \ + leaq (%r15, %r10), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %r13d; \ + leaq (%r9, %r11), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %r14d; \ + leaq (%r10, %r13), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %edi; \ + leaq (%r11, %r14), %rbp; \ + roll $18, %ebp; \ + xorl %ebp, %edx; \ + leaq (%r13, %rdi), %rbp; \ + roll $18, %ebp; \ + 
xorl %ebp, %r15d; \ + movq 48(%rsp), %rbp; \ + movq %r15, 72(%rsp); \ + leaq (%rax, %rbp), %r15; \ + roll $7, %r15d; \ + xorl %r15d, %ebx; \ + leaq (%rbp, %rbx), %r15; \ + roll $9, %r15d; \ + xorl %r15d, %ecx; \ + leaq (%rbx, %rcx), %r15; \ + roll $13, %r15d; \ + xorl %r15d, %eax; \ + leaq (%rcx, %rax), %r15; \ + roll $18, %r15d; \ + xorl %r15d, %ebp; \ + movq 88(%rsp), %r15; \ + movq %rbp, 48(%rsp); \ + leaq (%r12, %r15), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %esi; \ + leaq (%r15, %rsi), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %r8d; \ + leaq (%rsi, %r8), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %r12d; \ + leaq (%r8, %r12), %rbp; \ + roll $18, %ebp; \ + xorl %ebp, %r15d; \ + movq %r15, 88(%rsp); \ + movq 72(%rsp), %r15; \ + leaq (%rsi, %rdx), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %edi; \ + leaq (%r9, %r15), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %eax; \ + leaq (%rdx, %rdi), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %ecx; \ + leaq (%r15, %rax), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %r8d; \ + leaq (%rdi, %rcx), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %esi; \ + leaq (%rax, %r8), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %r9d; \ + leaq (%rcx, %rsi), %rbp; \ + roll $18, %ebp; \ + xorl %ebp, %edx; \ + leaq (%r8, %r9), %rbp; \ + roll $18, %ebp; \ + xorl %ebp, %r15d; \ + movq 48(%rsp), %rbp; \ + movq %r15, 72(%rsp); \ + leaq (%r10, %rbp), %r15; \ + roll $7, %r15d; \ + xorl %r15d, %r12d; \ + leaq (%rbp, %r12), %r15; \ + roll $9, %r15d; \ + xorl %r15d, %r11d; \ + leaq (%r12, %r11), %r15; \ + roll $13, %r15d; \ + xorl %r15d, %r10d; \ + leaq (%r11, %r10), %r15; \ + roll $18, %r15d; \ + xorl %r15d, %ebp; \ + movq 88(%rsp), %r15; \ + movq %rbp, 48(%rsp); \ + leaq (%rbx, %r15), %rbp; \ + roll $7, %ebp; \ + xorl %ebp, %r14d; \ + leaq (%r15, %r14), %rbp; \ + roll $9, %ebp; \ + xorl %ebp, %r13d; \ + leaq (%r14, %r13), %rbp; \ + roll $13, %ebp; \ + xorl %ebp, %ebx; \ + leaq (%r13, %rbx), %rbp; \ + roll $18, %ebp; \ + xorl %ebp, %r15d; \ + movq %r15, 88(%rsp); \ + .text .p2align 6 @@ -308,10 +294,10 @@ salsa8_core_gen: shrq $32, %r15 movq %r15, 88(%rsp) - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround - salsa8_core_gen_doubleround + salsa8_core_gen_doubleround() + salsa8_core_gen_doubleround() + salsa8_core_gen_doubleround() + salsa8_core_gen_doubleround() shlq $32, %rdi xorq %rdi, %rdx @@ -388,29 +374,14 @@ _scrypt_core: movq %rdx, %r8 #endif -.macro scrypt_core_cleanup -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %r15 - popq %r14 - popq %r13 - popq %r12 - popq %rbp - popq %rbx -.endm +#define scrypt_core_cleanup() \ + popq %r15; \ + popq %r14; \ + popq %r13; \ + popq %r12; \ + popq %rbp; \ + popq %rbx; \ + /* GenuineIntel processors have fast SIMD */ xorl %eax, %eax @@ -559,88 +530,81 @@ scrypt_core_gen_loop2: movdqa %xmm15, 112(%rdi) addq $136, %rsp - scrypt_core_cleanup + scrypt_core_cleanup() ret -.macro salsa8_core_xmm_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, 
%xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm +#define salsa8_core_xmm_doubleround() \ + movdqa %xmm1, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm0, %xmm4; \ + pxor %xmm5, %xmm3; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pxor %xmm5, %xmm2; \ + pshufd $0x93, %xmm3, %xmm3; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pxor %xmm5, %xmm1; \ + pshufd $0x4e, %xmm2, %xmm2; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + movdqa %xmm3, %xmm4; \ + pxor %xmm5, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm0, %xmm4; \ + pxor %xmm5, %xmm1; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pxor %xmm5, %xmm2; \ + pshufd $0x93, %xmm1, %xmm1; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pxor %xmm5, %xmm3; \ + pshufd $0x4e, %xmm2, %xmm2; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + + +#define salsa8_core_xmm() \ + salsa8_core_xmm_doubleround(); \ + salsa8_core_xmm_doubleround(); \ + salsa8_core_xmm_doubleround(); \ + salsa8_core_xmm_doubleround(); \ -.macro salsa8_core_xmm - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround - salsa8_core_xmm_doubleround -.endm .p2align 6 scrypt_core_xmm: @@ -721,7 +685,7 @@ scrypt_core_xmm_loop1: movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 - salsa8_core_xmm + salsa8_core_xmm() paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -735,7 +699,7 @@ scrypt_core_xmm_loop1: movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - salsa8_core_xmm + salsa8_core_xmm() paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 @@ -764,7 +728,7 @@ scrypt_core_xmm_loop2: movdqa %xmm9, %xmm1 movdqa %xmm10, %xmm2 movdqa %xmm11, %xmm3 - salsa8_core_xmm + salsa8_core_xmm() paddd %xmm0, %xmm8 paddd %xmm1, %xmm9 paddd %xmm2, %xmm10 @@ -782,7 +746,7 @@ scrypt_core_xmm_loop2: 
movdqa %xmm13, %xmm1 movdqa %xmm14, %xmm2 movdqa %xmm15, %xmm3 - salsa8_core_xmm + salsa8_core_xmm() paddd %xmm0, %xmm12 paddd %xmm1, %xmm13 paddd %xmm2, %xmm14 @@ -846,165 +810,158 @@ scrypt_core_xmm_loop2: movdqa %xmm14, 96(%rdi) movdqa %xmm13, 112(%rdi) - scrypt_core_cleanup + scrypt_core_cleanup() ret #if defined(USE_AVX) -.macro salsa8_core_3way_avx_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm0, %xmm4 - vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 - - vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, %xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vpslld $7, %xmm4, %xmm5 - vpsrld $25, %xmm4, %xmm4 - vpshufd $0x39, %xmm1, %xmm1 - vpxor %xmm5, %xmm1, %xmm1 - vpxor %xmm4, %xmm1, %xmm1 - vpslld $7, %xmm6, %xmm5 - vpsrld $25, %xmm6, %xmm6 - vpshufd $0x39, %xmm9, %xmm9 - vpxor %xmm5, %xmm9, %xmm9 - vpxor %xmm6, %xmm9, %xmm9 - vpslld $7, %xmm7, %xmm5 - vpsrld $25, %xmm7, %xmm7 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm5, %xmm13, %xmm13 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vpslld $9, %xmm4, %xmm5 - vpsrld $23, %xmm4, %xmm4 - vpxor %xmm5, %xmm2, %xmm2 - vpxor %xmm4, %xmm2, %xmm2 - vpslld $9, %xmm6, %xmm5 - vpsrld $23, %xmm6, %xmm6 - vpxor %xmm5, %xmm10, %xmm10 - vpxor %xmm6, %xmm10, %xmm10 - vpslld $9, %xmm7, %xmm5 - vpsrld $23, %xmm7, %xmm7 - vpxor %xmm5, %xmm14, %xmm14 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vpslld $13, %xmm4, %xmm5 - vpsrld $19, %xmm4, %xmm4 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm5, %xmm3, %xmm3 - vpxor %xmm4, %xmm3, %xmm3 - vpslld $13, %xmm6, %xmm5 - vpsrld $19, %xmm6, %xmm6 - 
vpxor %xmm5, %xmm11, %xmm11 - vpxor %xmm6, %xmm11, %xmm11 - vpslld $13, %xmm7, %xmm5 - vpsrld $19, %xmm7, %xmm7 - vpxor %xmm5, %xmm15, %xmm15 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm2, %xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vpslld $18, %xmm4, %xmm5 - vpsrld $14, %xmm4, %xmm4 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpxor %xmm5, %xmm0, %xmm0 - vpxor %xmm4, %xmm0, %xmm0 - vpslld $18, %xmm6, %xmm5 - vpsrld $14, %xmm6, %xmm6 - vpshufd $0x4e, %xmm14, %xmm14 - vpshufd $0x39, %xmm11, %xmm11 - vpxor %xmm5, %xmm8, %xmm8 - vpxor %xmm6, %xmm8, %xmm8 - vpslld $18, %xmm7, %xmm5 - vpsrld $14, %xmm7, %xmm7 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm15, %xmm15 - vpxor %xmm5, %xmm12, %xmm12 - vpxor %xmm7, %xmm12, %xmm12 -.endm +#define salsa8_core_3way_avx_doubleround() \ + vpaddd %xmm0, %xmm1, %xmm4; \ + vpaddd %xmm8, %xmm9, %xmm6; \ + vpaddd %xmm12, %xmm13, %xmm7; \ + vpslld $7, %xmm4, %xmm5; \ + vpsrld $25, %xmm4, %xmm4; \ + vpxor %xmm5, %xmm3, %xmm3; \ + vpxor %xmm4, %xmm3, %xmm3; \ + vpslld $7, %xmm6, %xmm5; \ + vpsrld $25, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm11, %xmm11; \ + vpxor %xmm6, %xmm11, %xmm11; \ + vpslld $7, %xmm7, %xmm5; \ + vpsrld $25, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm15, %xmm15; \ + vpxor %xmm7, %xmm15, %xmm15; \ + vpaddd %xmm3, %xmm0, %xmm4; \ + vpaddd %xmm11, %xmm8, %xmm6; \ + vpaddd %xmm15, %xmm12, %xmm7; \ + vpslld $9, %xmm4, %xmm5; \ + vpsrld $23, %xmm4, %xmm4; \ + vpxor %xmm5, %xmm2, %xmm2; \ + vpxor %xmm4, %xmm2, %xmm2; \ + vpslld $9, %xmm6, %xmm5; \ + vpsrld $23, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm10, %xmm10; \ + vpxor %xmm6, %xmm10, %xmm10; \ + vpslld $9, %xmm7, %xmm5; \ + vpsrld $23, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm14, %xmm14; \ + vpxor %xmm7, %xmm14, %xmm14; \ + vpaddd %xmm2, %xmm3, %xmm4; \ + vpaddd %xmm10, %xmm11, %xmm6; \ + vpaddd %xmm14, %xmm15, %xmm7; \ + vpslld $13, %xmm4, %xmm5; \ + vpsrld $19, %xmm4, %xmm4; \ + vpshufd $0x93, %xmm3, %xmm3; \ + vpshufd $0x93, %xmm11, %xmm11; \ + vpshufd $0x93, %xmm15, %xmm15; \ + vpxor %xmm5, %xmm1, %xmm1; \ + vpxor %xmm4, %xmm1, %xmm1; \ + vpslld $13, %xmm6, %xmm5; \ + vpsrld $19, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm9, %xmm9; \ + vpxor %xmm6, %xmm9, %xmm9; \ + vpslld $13, %xmm7, %xmm5; \ + vpsrld $19, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm13, %xmm13; \ + vpxor %xmm7, %xmm13, %xmm13; \ + vpaddd %xmm1, %xmm2, %xmm4; \ + vpaddd %xmm9, %xmm10, %xmm6; \ + vpaddd %xmm13, %xmm14, %xmm7; \ + vpslld $18, %xmm4, %xmm5; \ + vpsrld $14, %xmm4, %xmm4; \ + vpshufd $0x4e, %xmm2, %xmm2; \ + vpshufd $0x4e, %xmm10, %xmm10; \ + vpshufd $0x4e, %xmm14, %xmm14; \ + vpxor %xmm5, %xmm0, %xmm0; \ + vpxor %xmm4, %xmm0, %xmm0; \ + vpslld $18, %xmm6, %xmm5; \ + vpsrld $14, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm8, %xmm8; \ + vpxor %xmm6, %xmm8, %xmm8; \ + vpslld $18, %xmm7, %xmm5; \ + vpsrld $14, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm12, %xmm12; \ + vpxor %xmm7, %xmm12, %xmm12; \ + vpaddd %xmm0, %xmm3, %xmm4; \ + vpaddd %xmm8, %xmm11, %xmm6; \ + vpaddd %xmm12, %xmm15, %xmm7; \ + vpslld $7, %xmm4, %xmm5; \ + vpsrld $25, %xmm4, %xmm4; \ + vpshufd $0x39, %xmm1, %xmm1; \ + vpxor %xmm5, %xmm1, %xmm1; \ + vpxor %xmm4, %xmm1, %xmm1; \ + vpslld $7, %xmm6, %xmm5; \ + vpsrld $25, %xmm6, %xmm6; \ + vpshufd $0x39, %xmm9, %xmm9; \ + vpxor %xmm5, %xmm9, %xmm9; \ + vpxor %xmm6, %xmm9, %xmm9; \ + vpslld $7, %xmm7, %xmm5; \ + vpsrld $25, %xmm7, %xmm7; \ + vpshufd $0x39, %xmm13, %xmm13; \ + vpxor %xmm5, %xmm13, %xmm13; \ + vpxor %xmm7, %xmm13, %xmm13; \ + vpaddd %xmm1, %xmm0, %xmm4; \ + vpaddd %xmm9, %xmm8, %xmm6; \ 
+ vpaddd %xmm13, %xmm12, %xmm7; \ + vpslld $9, %xmm4, %xmm5; \ + vpsrld $23, %xmm4, %xmm4; \ + vpxor %xmm5, %xmm2, %xmm2; \ + vpxor %xmm4, %xmm2, %xmm2; \ + vpslld $9, %xmm6, %xmm5; \ + vpsrld $23, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm10, %xmm10; \ + vpxor %xmm6, %xmm10, %xmm10; \ + vpslld $9, %xmm7, %xmm5; \ + vpsrld $23, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm14, %xmm14; \ + vpxor %xmm7, %xmm14, %xmm14; \ + vpaddd %xmm2, %xmm1, %xmm4; \ + vpaddd %xmm10, %xmm9, %xmm6; \ + vpaddd %xmm14, %xmm13, %xmm7; \ + vpslld $13, %xmm4, %xmm5; \ + vpsrld $19, %xmm4, %xmm4; \ + vpshufd $0x93, %xmm1, %xmm1; \ + vpshufd $0x93, %xmm9, %xmm9; \ + vpshufd $0x93, %xmm13, %xmm13; \ + vpxor %xmm5, %xmm3, %xmm3; \ + vpxor %xmm4, %xmm3, %xmm3; \ + vpslld $13, %xmm6, %xmm5; \ + vpsrld $19, %xmm6, %xmm6; \ + vpxor %xmm5, %xmm11, %xmm11; \ + vpxor %xmm6, %xmm11, %xmm11; \ + vpslld $13, %xmm7, %xmm5; \ + vpsrld $19, %xmm7, %xmm7; \ + vpxor %xmm5, %xmm15, %xmm15; \ + vpxor %xmm7, %xmm15, %xmm15; \ + vpaddd %xmm3, %xmm2, %xmm4; \ + vpaddd %xmm11, %xmm10, %xmm6; \ + vpaddd %xmm15, %xmm14, %xmm7; \ + vpslld $18, %xmm4, %xmm5; \ + vpsrld $14, %xmm4, %xmm4; \ + vpshufd $0x4e, %xmm2, %xmm2; \ + vpshufd $0x4e, %xmm10, %xmm10; \ + vpxor %xmm5, %xmm0, %xmm0; \ + vpxor %xmm4, %xmm0, %xmm0; \ + vpslld $18, %xmm6, %xmm5; \ + vpsrld $14, %xmm6, %xmm6; \ + vpshufd $0x4e, %xmm14, %xmm14; \ + vpshufd $0x39, %xmm11, %xmm11; \ + vpxor %xmm5, %xmm8, %xmm8; \ + vpxor %xmm6, %xmm8, %xmm8; \ + vpslld $18, %xmm7, %xmm5; \ + vpsrld $14, %xmm7, %xmm7; \ + vpshufd $0x39, %xmm3, %xmm3; \ + vpshufd $0x39, %xmm15, %xmm15; \ + vpxor %xmm5, %xmm12, %xmm12; \ + vpxor %xmm7, %xmm12, %xmm12; \ + + +#define salsa8_core_3way_avx() \ + salsa8_core_3way_avx_doubleround(); \ + salsa8_core_3way_avx_doubleround(); \ + salsa8_core_3way_avx_doubleround(); \ + salsa8_core_3way_avx_doubleround(); \ -.macro salsa8_core_3way_avx - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround - salsa8_core_3way_avx_doubleround -.endm #endif /* USE_AVX */ .text @@ -1036,26 +993,11 @@ _scrypt_core_3way: #endif subq $392, %rsp -.macro scrypt_core_3way_cleanup - addq $392, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi - movdqa 8(%rsp), %xmm6 - movdqa 24(%rsp), %xmm7 - movdqa 40(%rsp), %xmm8 - movdqa 56(%rsp), %xmm9 - movdqa 72(%rsp), %xmm10 - movdqa 88(%rsp), %xmm11 - movdqa 104(%rsp), %xmm12 - movdqa 120(%rsp), %xmm13 - movdqa 136(%rsp), %xmm14 - movdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %rbp - popq %rbx -.endm +#define scrypt_core_3way_cleanup() \ + addq $392, %rsp; \ + popq %rbp; \ + popq %rbx; \ + #if !defined(USE_AVX) jmp scrypt_core_3way_xmm @@ -1081,12 +1023,12 @@ _scrypt_core_3way: #endif scrypt_core_3way_avx: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + scrypt_shuffle(%rdi, 0, %rsp, 0) + scrypt_shuffle(%rdi, 64, %rsp, 64) + scrypt_shuffle(%rdi, 128, %rsp, 128) + scrypt_shuffle(%rdi, 192, %rsp, 192) + scrypt_shuffle(%rdi, 256, %rsp, 256) + scrypt_shuffle(%rdi, 320, %rsp, 320) movdqa 64(%rsp), %xmm0 movdqa 80(%rsp), %xmm1 @@ -1143,7 +1085,7 @@ scrypt_core_3way_avx_loop1: movdqa %xmm14, 256+32(%rbx) movdqa %xmm15, 256+48(%rbx) - salsa8_core_3way_avx + salsa8_core_3way_avx() paddd 0(%rbx), %xmm0 paddd 16(%rbx), %xmm1 paddd 32(%rbx), %xmm2 @@ -1193,7 +1135,7 @@ scrypt_core_3way_avx_loop1: movdqa %xmm13, 
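
/*
 * Editor's note on the scrypt_core_3way_cleanup conversion above (the same
 * applies to scrypt_core_6way_cleanup further down): a preprocessor directive
 * such as #if cannot appear inside a #define body, so the _WIN64/__CYGWIN__
 * block the old .macro carried (popping %rsi/%rdi and restoring xmm6-xmm15)
 * is not part of the new macro; on non-Windows targets that block compiled to
 * nothing anyway.  Purely illustrative C, showing the usual pattern when such
 * conditional code does need to survive a .macro-to-#define conversion: pick
 * one complete definition per branch instead of conditionals inside the body.
 */

#include <stdio.h>

#if defined(_WIN64) || defined(__CYGWIN__)
#define cleanup_note() puts("epilogue would also restore rdi, rsi and xmm6-xmm15")
#else
#define cleanup_note() puts("epilogue only restores rbp and rbx")
#endif

int main(void)
{
    cleanup_note();
    return 0;
}
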
256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx + salsa8_core_3way_avx() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1276,7 +1218,7 @@ scrypt_core_3way_avx_loop2: movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_avx + salsa8_core_3way_avx() paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 @@ -1338,7 +1280,7 @@ scrypt_core_3way_avx_loop2: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_avx + salsa8_core_3way_avx() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1367,132 +1309,125 @@ scrypt_core_3way_avx_loop2: subq $1, %rcx ja scrypt_core_3way_avx_loop2 - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 + scrypt_shuffle(%rsp, 0, %rdi, 0) + scrypt_shuffle(%rsp, 64, %rdi, 64) + scrypt_shuffle(%rsp, 128, %rdi, 128) + scrypt_shuffle(%rsp, 192, %rdi, 192) + scrypt_shuffle(%rsp, 256, %rdi, 256) + scrypt_shuffle(%rsp, 320, %rdi, 320) - scrypt_core_3way_cleanup + scrypt_core_3way_cleanup() ret #if defined(USE_XOP) -.macro salsa8_core_3way_xop_doubleround - vpaddd %xmm0, %xmm1, %xmm4 - vpaddd %xmm8, %xmm9, %xmm6 - vpaddd %xmm12, %xmm13, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm0, %xmm4 - vpaddd %xmm11, %xmm8, %xmm6 - vpaddd %xmm15, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm3, %xmm4 - vpaddd %xmm10, %xmm11, %xmm6 - vpaddd %xmm14, %xmm15, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm3, %xmm3 - vpshufd $0x93, %xmm11, %xmm11 - vpshufd $0x93, %xmm15, %xmm15 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm2, %xmm4 - vpaddd %xmm9, %xmm10, %xmm6 - vpaddd %xmm13, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm7, %xmm12, %xmm12 - - vpaddd %xmm0, %xmm3, %xmm4 - vpaddd %xmm8, %xmm11, %xmm6 - vpaddd %xmm12, %xmm15, %xmm7 - vprotd $7, %xmm4, %xmm4 - vprotd $7, %xmm6, %xmm6 - vprotd $7, %xmm7, %xmm7 - vpshufd $0x39, %xmm1, %xmm1 - vpshufd $0x39, %xmm9, %xmm9 - vpshufd $0x39, %xmm13, %xmm13 - vpxor %xmm4, %xmm1, %xmm1 - vpxor %xmm6, %xmm9, %xmm9 - vpxor %xmm7, %xmm13, %xmm13 - - vpaddd %xmm1, %xmm0, %xmm4 - vpaddd %xmm9, %xmm8, %xmm6 - vpaddd %xmm13, %xmm12, %xmm7 - vprotd $9, %xmm4, %xmm4 - vprotd $9, %xmm6, %xmm6 - vprotd $9, %xmm7, %xmm7 - vpxor %xmm4, %xmm2, %xmm2 - vpxor %xmm6, %xmm10, %xmm10 - vpxor %xmm7, %xmm14, %xmm14 - - vpaddd %xmm2, %xmm1, %xmm4 - vpaddd %xmm10, %xmm9, %xmm6 - vpaddd %xmm14, %xmm13, %xmm7 - vprotd $13, %xmm4, %xmm4 - vprotd $13, %xmm6, %xmm6 - vprotd $13, %xmm7, %xmm7 - vpshufd $0x93, %xmm1, %xmm1 - vpshufd $0x93, %xmm9, %xmm9 - vpshufd $0x93, %xmm13, %xmm13 - vpxor %xmm4, %xmm3, %xmm3 - vpxor %xmm6, %xmm11, %xmm11 - vpxor %xmm7, %xmm15, %xmm15 - - vpaddd %xmm3, %xmm2, 
%xmm4 - vpaddd %xmm11, %xmm10, %xmm6 - vpaddd %xmm15, %xmm14, %xmm7 - vprotd $18, %xmm4, %xmm4 - vprotd $18, %xmm6, %xmm6 - vprotd $18, %xmm7, %xmm7 - vpshufd $0x4e, %xmm2, %xmm2 - vpshufd $0x4e, %xmm10, %xmm10 - vpshufd $0x4e, %xmm14, %xmm14 - vpxor %xmm4, %xmm0, %xmm0 - vpxor %xmm6, %xmm8, %xmm8 - vpxor %xmm7, %xmm12, %xmm12 - vpshufd $0x39, %xmm3, %xmm3 - vpshufd $0x39, %xmm11, %xmm11 - vpshufd $0x39, %xmm15, %xmm15 -.endm +#define salsa8_core_3way_xop_doubleround() \ + vpaddd %xmm0, %xmm1, %xmm4; \ + vpaddd %xmm8, %xmm9, %xmm6; \ + vpaddd %xmm12, %xmm13, %xmm7; \ + vprotd $7, %xmm4, %xmm4; \ + vprotd $7, %xmm6, %xmm6; \ + vprotd $7, %xmm7, %xmm7; \ + vpxor %xmm4, %xmm3, %xmm3; \ + vpxor %xmm6, %xmm11, %xmm11; \ + vpxor %xmm7, %xmm15, %xmm15; \ + vpaddd %xmm3, %xmm0, %xmm4; \ + vpaddd %xmm11, %xmm8, %xmm6; \ + vpaddd %xmm15, %xmm12, %xmm7; \ + vprotd $9, %xmm4, %xmm4; \ + vprotd $9, %xmm6, %xmm6; \ + vprotd $9, %xmm7, %xmm7; \ + vpxor %xmm4, %xmm2, %xmm2; \ + vpxor %xmm6, %xmm10, %xmm10; \ + vpxor %xmm7, %xmm14, %xmm14; \ + vpaddd %xmm2, %xmm3, %xmm4; \ + vpaddd %xmm10, %xmm11, %xmm6; \ + vpaddd %xmm14, %xmm15, %xmm7; \ + vprotd $13, %xmm4, %xmm4; \ + vprotd $13, %xmm6, %xmm6; \ + vprotd $13, %xmm7, %xmm7; \ + vpshufd $0x93, %xmm3, %xmm3; \ + vpshufd $0x93, %xmm11, %xmm11; \ + vpshufd $0x93, %xmm15, %xmm15; \ + vpxor %xmm4, %xmm1, %xmm1; \ + vpxor %xmm6, %xmm9, %xmm9; \ + vpxor %xmm7, %xmm13, %xmm13; \ + vpaddd %xmm1, %xmm2, %xmm4; \ + vpaddd %xmm9, %xmm10, %xmm6; \ + vpaddd %xmm13, %xmm14, %xmm7; \ + vprotd $18, %xmm4, %xmm4; \ + vprotd $18, %xmm6, %xmm6; \ + vprotd $18, %xmm7, %xmm7; \ + vpshufd $0x4e, %xmm2, %xmm2; \ + vpshufd $0x4e, %xmm10, %xmm10; \ + vpshufd $0x4e, %xmm14, %xmm14; \ + vpxor %xmm6, %xmm8, %xmm8; \ + vpxor %xmm4, %xmm0, %xmm0; \ + vpxor %xmm7, %xmm12, %xmm12; \ + vpaddd %xmm0, %xmm3, %xmm4; \ + vpaddd %xmm8, %xmm11, %xmm6; \ + vpaddd %xmm12, %xmm15, %xmm7; \ + vprotd $7, %xmm4, %xmm4; \ + vprotd $7, %xmm6, %xmm6; \ + vprotd $7, %xmm7, %xmm7; \ + vpshufd $0x39, %xmm1, %xmm1; \ + vpshufd $0x39, %xmm9, %xmm9; \ + vpshufd $0x39, %xmm13, %xmm13; \ + vpxor %xmm4, %xmm1, %xmm1; \ + vpxor %xmm6, %xmm9, %xmm9; \ + vpxor %xmm7, %xmm13, %xmm13; \ + vpaddd %xmm1, %xmm0, %xmm4; \ + vpaddd %xmm9, %xmm8, %xmm6; \ + vpaddd %xmm13, %xmm12, %xmm7; \ + vprotd $9, %xmm4, %xmm4; \ + vprotd $9, %xmm6, %xmm6; \ + vprotd $9, %xmm7, %xmm7; \ + vpxor %xmm4, %xmm2, %xmm2; \ + vpxor %xmm6, %xmm10, %xmm10; \ + vpxor %xmm7, %xmm14, %xmm14; \ + vpaddd %xmm2, %xmm1, %xmm4; \ + vpaddd %xmm10, %xmm9, %xmm6; \ + vpaddd %xmm14, %xmm13, %xmm7; \ + vprotd $13, %xmm4, %xmm4; \ + vprotd $13, %xmm6, %xmm6; \ + vprotd $13, %xmm7, %xmm7; \ + vpshufd $0x93, %xmm1, %xmm1; \ + vpshufd $0x93, %xmm9, %xmm9; \ + vpshufd $0x93, %xmm13, %xmm13; \ + vpxor %xmm4, %xmm3, %xmm3; \ + vpxor %xmm6, %xmm11, %xmm11; \ + vpxor %xmm7, %xmm15, %xmm15; \ + vpaddd %xmm3, %xmm2, %xmm4; \ + vpaddd %xmm11, %xmm10, %xmm6; \ + vpaddd %xmm15, %xmm14, %xmm7; \ + vprotd $18, %xmm4, %xmm4; \ + vprotd $18, %xmm6, %xmm6; \ + vprotd $18, %xmm7, %xmm7; \ + vpshufd $0x4e, %xmm2, %xmm2; \ + vpshufd $0x4e, %xmm10, %xmm10; \ + vpshufd $0x4e, %xmm14, %xmm14; \ + vpxor %xmm4, %xmm0, %xmm0; \ + vpxor %xmm6, %xmm8, %xmm8; \ + vpxor %xmm7, %xmm12, %xmm12; \ + vpshufd $0x39, %xmm3, %xmm3; \ + vpshufd $0x39, %xmm11, %xmm11; \ + vpshufd $0x39, %xmm15, %xmm15; \ + + +#define salsa8_core_3way_xop() \ + salsa8_core_3way_xop_doubleround(); \ + salsa8_core_3way_xop_doubleround(); \ + salsa8_core_3way_xop_doubleround(); \ + salsa8_core_3way_xop_doubleround(); 
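
/*
 * Editor's note (illustrative C, not part of the patch): the XOP variant just
 * converted differs from the SSE2/AVX ones only in how rotations are formed.
 * vprotd is a true packed 32-bit rotate, so each shift-left / shift-right /
 * XOR / XOR group collapses to one instruction; in the other paths the shift
 * pairs always sum to 32 (7/25, 9/23, 13/19, 18/14).  A minimal check of the
 * identity being relied on:
 */

#include <assert.h>
#include <stdint.h>

/* Rotate synthesized from two shifts and an XOR, as pslld/psrld/pxor do. */
static uint32_t rotl_shift_pair(uint32_t x, int k)
{
    return (uint32_t)(x << k) ^ (x >> (32 - k));
}

/* Direct rotate, as XOP's vprotd provides per 32-bit lane. */
static uint32_t rotl_direct(uint32_t x, int k)
{
    return (uint32_t)(x << k) | (x >> (32 - k));
}

int main(void)
{
    const int amounts[4] = { 7, 9, 13, 18 };   /* the Salsa20 rotation amounts */
    uint32_t x = 0x9e3779b9u;
    for (int i = 0; i < 1000000; i++) {
        x = x * 2654435761u + 1;
        for (int j = 0; j < 4; j++)
            assert(rotl_shift_pair(x, amounts[j]) == rotl_direct(x, amounts[j]));
    }
    return 0;
}
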
\ -.macro salsa8_core_3way_xop - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround - salsa8_core_3way_xop_doubleround -.endm .p2align 6 scrypt_core_3way_xop: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + scrypt_shuffle(%rdi, 0, %rsp, 0) + scrypt_shuffle(%rdi, 64, %rsp, 64) + scrypt_shuffle(%rdi, 128, %rsp, 128) + scrypt_shuffle(%rdi, 192, %rsp, 192) + scrypt_shuffle(%rdi, 256, %rsp, 256) + scrypt_shuffle(%rdi, 320, %rsp, 320) movdqa 64(%rsp), %xmm0 movdqa 80(%rsp), %xmm1 @@ -1549,7 +1484,7 @@ scrypt_core_3way_xop_loop1: movdqa %xmm14, 256+32(%rbx) movdqa %xmm15, 256+48(%rbx) - salsa8_core_3way_xop + salsa8_core_3way_xop() paddd 0(%rbx), %xmm0 paddd 16(%rbx), %xmm1 paddd 32(%rbx), %xmm2 @@ -1599,7 +1534,7 @@ scrypt_core_3way_xop_loop1: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop + salsa8_core_3way_xop() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1682,7 +1617,7 @@ scrypt_core_3way_xop_loop2: movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xop + salsa8_core_3way_xop() paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 @@ -1744,7 +1679,7 @@ scrypt_core_3way_xop_loop2: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xop + salsa8_core_3way_xop() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -1773,229 +1708,222 @@ scrypt_core_3way_xop_loop2: subq $1, %rcx ja scrypt_core_3way_xop_loop2 - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 + scrypt_shuffle(%rsp, 0, %rdi, 0) + scrypt_shuffle(%rsp, 64, %rdi, 64) + scrypt_shuffle(%rsp, 128, %rdi, 128) + scrypt_shuffle(%rsp, 192, %rdi, 192) + scrypt_shuffle(%rsp, 256, %rdi, 256) + scrypt_shuffle(%rsp, 320, %rdi, 320) - scrypt_core_3way_cleanup + scrypt_core_3way_cleanup() ret #endif /* USE_XOP */ #endif /* USE_AVX */ -.macro salsa8_core_3way_xmm_doubleround - movdqa %xmm1, %xmm4 - movdqa %xmm9, %xmm6 - movdqa %xmm13, %xmm7 - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - pxor %xmm5, %xmm3 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm11 - pxor %xmm5, %xmm11 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm15 - pxor %xmm5, %xmm15 - movdqa %xmm12, %xmm7 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pshufd $0x93, %xmm3, %xmm3 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm11, %xmm6 - pshufd $0x93, %xmm11, %xmm11 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm14 - pshufd $0x93, %xmm15, %xmm15 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, 
%xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm1 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm9 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm9 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm13 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm13 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - pxor %xmm5, %xmm0 - movdqa %xmm3, %xmm4 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm9, %xmm9 - pxor %xmm5, %xmm8 - movdqa %xmm11, %xmm6 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - movdqa %xmm15, %xmm7 - pxor %xmm5, %xmm12 - pshufd $0x39, %xmm13, %xmm13 - - paddd %xmm0, %xmm4 - paddd %xmm8, %xmm6 - paddd %xmm12, %xmm7 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm1 - pxor %xmm5, %xmm1 - movdqa %xmm0, %xmm4 - movdqa %xmm6, %xmm5 - pslld $7, %xmm6 - psrld $25, %xmm5 - pxor %xmm6, %xmm9 - pxor %xmm5, %xmm9 - movdqa %xmm8, %xmm6 - movdqa %xmm7, %xmm5 - pslld $7, %xmm7 - psrld $25, %xmm5 - pxor %xmm7, %xmm13 - pxor %xmm5, %xmm13 - movdqa %xmm12, %xmm7 - - paddd %xmm1, %xmm4 - paddd %xmm9, %xmm6 - paddd %xmm13, %xmm7 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pshufd $0x93, %xmm1, %xmm1 - pxor %xmm5, %xmm2 - movdqa %xmm6, %xmm5 - pslld $9, %xmm6 - psrld $23, %xmm5 - pxor %xmm6, %xmm10 - movdqa %xmm9, %xmm6 - pshufd $0x93, %xmm9, %xmm9 - pxor %xmm5, %xmm10 - movdqa %xmm7, %xmm5 - pslld $9, %xmm7 - psrld $23, %xmm5 - pxor %xmm7, %xmm14 - movdqa %xmm13, %xmm7 - pshufd $0x93, %xmm13, %xmm13 - pxor %xmm5, %xmm14 - - paddd %xmm2, %xmm4 - paddd %xmm10, %xmm6 - paddd %xmm14, %xmm7 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pshufd $0x4e, %xmm2, %xmm2 - pxor %xmm5, %xmm3 - movdqa %xmm6, %xmm5 - pslld $13, %xmm6 - psrld $19, %xmm5 - pxor %xmm6, %xmm11 - movdqa %xmm10, %xmm6 - pshufd $0x4e, %xmm10, %xmm10 - pxor %xmm5, %xmm11 - movdqa %xmm7, %xmm5 - pslld $13, %xmm7 - psrld $19, %xmm5 - pxor %xmm7, %xmm15 - movdqa %xmm14, %xmm7 - pshufd $0x4e, %xmm14, %xmm14 - pxor %xmm5, %xmm15 - - paddd %xmm3, %xmm4 - paddd %xmm11, %xmm6 - paddd %xmm15, %xmm7 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 - movdqa %xmm6, %xmm5 - pslld $18, %xmm6 - psrld $14, %xmm5 - pxor %xmm6, %xmm8 - pshufd $0x39, %xmm11, %xmm11 - pxor %xmm5, %xmm8 - movdqa %xmm7, %xmm5 - pslld $18, %xmm7 - psrld $14, %xmm5 - pxor %xmm7, %xmm12 - pshufd $0x39, %xmm15, %xmm15 - pxor %xmm5, %xmm12 -.endm +#define salsa8_core_3way_xmm_doubleround() \ + movdqa %xmm1, %xmm4; \ + movdqa %xmm9, %xmm6; \ + movdqa %xmm13, %xmm7; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + paddd %xmm12, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + pxor %xmm6, %xmm11; \ + pxor %xmm5, %xmm11; \ + movdqa %xmm8, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $7, %xmm7; \ + psrld $25, %xmm5; \ + pxor %xmm7, %xmm15; \ + pxor %xmm5, %xmm15; \ + movdqa %xmm12, %xmm7; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + paddd %xmm15, %xmm7; \ + movdqa %xmm4, 
%xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pshufd $0x93, %xmm3, %xmm3; \ + pxor %xmm5, %xmm2; \ + movdqa %xmm6, %xmm5; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm11, %xmm6; \ + pshufd $0x93, %xmm11, %xmm11; \ + pxor %xmm5, %xmm10; \ + movdqa %xmm7, %xmm5; \ + pslld $9, %xmm7; \ + psrld $23, %xmm5; \ + pxor %xmm7, %xmm14; \ + movdqa %xmm15, %xmm7; \ + pxor %xmm5, %xmm14; \ + pshufd $0x93, %xmm15, %xmm15; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + paddd %xmm14, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm6, %xmm5; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + pxor %xmm6, %xmm9; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm9; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm7; \ + psrld $19, %xmm5; \ + pxor %xmm7, %xmm13; \ + movdqa %xmm14, %xmm7; \ + pshufd $0x4e, %xmm14, %xmm14; \ + pxor %xmm5, %xmm13; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + paddd %xmm13, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm3, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm9, %xmm9; \ + pxor %xmm5, %xmm8; \ + movdqa %xmm11, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $18, %xmm7; \ + psrld $14, %xmm5; \ + pxor %xmm7, %xmm12; \ + movdqa %xmm15, %xmm7; \ + pxor %xmm5, %xmm12; \ + pshufd $0x39, %xmm13, %xmm13; \ + paddd %xmm0, %xmm4; \ + paddd %xmm8, %xmm6; \ + paddd %xmm12, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + pxor %xmm5, %xmm1; \ + movdqa %xmm0, %xmm4; \ + movdqa %xmm6, %xmm5; \ + pslld $7, %xmm6; \ + psrld $25, %xmm5; \ + pxor %xmm6, %xmm9; \ + pxor %xmm5, %xmm9; \ + movdqa %xmm8, %xmm6; \ + movdqa %xmm7, %xmm5; \ + pslld $7, %xmm7; \ + psrld $25, %xmm5; \ + pxor %xmm7, %xmm13; \ + pxor %xmm5, %xmm13; \ + movdqa %xmm12, %xmm7; \ + paddd %xmm1, %xmm4; \ + paddd %xmm9, %xmm6; \ + paddd %xmm13, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pshufd $0x93, %xmm1, %xmm1; \ + pxor %xmm5, %xmm2; \ + movdqa %xmm6, %xmm5; \ + pslld $9, %xmm6; \ + psrld $23, %xmm5; \ + pxor %xmm6, %xmm10; \ + movdqa %xmm9, %xmm6; \ + pshufd $0x93, %xmm9, %xmm9; \ + pxor %xmm5, %xmm10; \ + movdqa %xmm7, %xmm5; \ + pslld $9, %xmm7; \ + psrld $23, %xmm5; \ + pxor %xmm7, %xmm14; \ + movdqa %xmm13, %xmm7; \ + pshufd $0x93, %xmm13, %xmm13; \ + pxor %xmm5, %xmm14; \ + paddd %xmm2, %xmm4; \ + paddd %xmm10, %xmm6; \ + paddd %xmm14, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pshufd $0x4e, %xmm2, %xmm2; \ + pxor %xmm5, %xmm3; \ + movdqa %xmm6, %xmm5; \ + pslld $13, %xmm6; \ + psrld $19, %xmm5; \ + pxor %xmm6, %xmm11; \ + movdqa %xmm10, %xmm6; \ + pshufd $0x4e, %xmm10, %xmm10; \ + pxor %xmm5, %xmm11; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm7; \ + psrld $19, %xmm5; \ + pxor %xmm7, %xmm15; \ + movdqa %xmm14, %xmm7; \ + pshufd $0x4e, %xmm14, %xmm14; \ + pxor %xmm5, %xmm15; \ + paddd %xmm3, %xmm4; \ + paddd %xmm11, %xmm6; \ + paddd %xmm15, %xmm7; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + 
pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + movdqa %xmm6, %xmm5; \ + pslld $18, %xmm6; \ + psrld $14, %xmm5; \ + pxor %xmm6, %xmm8; \ + pshufd $0x39, %xmm11, %xmm11; \ + pxor %xmm5, %xmm8; \ + movdqa %xmm7, %xmm5; \ + pslld $18, %xmm7; \ + psrld $14, %xmm5; \ + pxor %xmm7, %xmm12; \ + pshufd $0x39, %xmm15, %xmm15; \ + pxor %xmm5, %xmm12; \ + + +#define salsa8_core_3way_xmm() \ + salsa8_core_3way_xmm_doubleround(); \ + salsa8_core_3way_xmm_doubleround(); \ + salsa8_core_3way_xmm_doubleround(); \ + salsa8_core_3way_xmm_doubleround(); \ -.macro salsa8_core_3way_xmm - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround - salsa8_core_3way_xmm_doubleround -.endm .p2align 6 scrypt_core_3way_xmm: - scrypt_shuffle %rdi, 0, %rsp, 0 - scrypt_shuffle %rdi, 64, %rsp, 64 - scrypt_shuffle %rdi, 128, %rsp, 128 - scrypt_shuffle %rdi, 192, %rsp, 192 - scrypt_shuffle %rdi, 256, %rsp, 256 - scrypt_shuffle %rdi, 320, %rsp, 320 + scrypt_shuffle(%rdi, 0, %rsp, 0) + scrypt_shuffle(%rdi, 64, %rsp, 64) + scrypt_shuffle(%rdi, 128, %rsp, 128) + scrypt_shuffle(%rdi, 192, %rsp, 192) + scrypt_shuffle(%rdi, 256, %rsp, 256) + scrypt_shuffle(%rdi, 320, %rsp, 320) movdqa 64(%rsp), %xmm0 movdqa 80(%rsp), %xmm1 @@ -2052,7 +1980,7 @@ scrypt_core_3way_xmm_loop1: movdqa %xmm14, 256+32(%rbx) movdqa %xmm15, 256+48(%rbx) - salsa8_core_3way_xmm + salsa8_core_3way_xmm() paddd 0(%rbx), %xmm0 paddd 16(%rbx), %xmm1 paddd 32(%rbx), %xmm2 @@ -2102,7 +2030,7 @@ scrypt_core_3way_xmm_loop1: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm + salsa8_core_3way_xmm() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -2185,7 +2113,7 @@ scrypt_core_3way_xmm_loop2: movdqa %xmm13, 256+16(%rsp) movdqa %xmm14, 256+32(%rsp) movdqa %xmm15, 256+48(%rsp) - salsa8_core_3way_xmm + salsa8_core_3way_xmm() paddd 0(%rsp), %xmm0 paddd 16(%rsp), %xmm1 paddd 32(%rsp), %xmm2 @@ -2247,7 +2175,7 @@ scrypt_core_3way_xmm_loop2: movdqa %xmm13, 256+80(%rsp) movdqa %xmm14, 256+96(%rsp) movdqa %xmm15, 256+112(%rsp) - salsa8_core_3way_xmm + salsa8_core_3way_xmm() paddd 64(%rsp), %xmm0 paddd 80(%rsp), %xmm1 paddd 96(%rsp), %xmm2 @@ -2276,173 +2204,166 @@ scrypt_core_3way_xmm_loop2: subq $1, %rcx ja scrypt_core_3way_xmm_loop2 - scrypt_shuffle %rsp, 0, %rdi, 0 - scrypt_shuffle %rsp, 64, %rdi, 64 - scrypt_shuffle %rsp, 128, %rdi, 128 - scrypt_shuffle %rsp, 192, %rdi, 192 - scrypt_shuffle %rsp, 256, %rdi, 256 - scrypt_shuffle %rsp, 320, %rdi, 320 + scrypt_shuffle(%rsp, 0, %rdi, 0) + scrypt_shuffle(%rsp, 64, %rdi, 64) + scrypt_shuffle(%rsp, 128, %rdi, 128) + scrypt_shuffle(%rsp, 192, %rdi, 192) + scrypt_shuffle(%rsp, 256, %rdi, 256) + scrypt_shuffle(%rsp, 320, %rdi, 320) - scrypt_core_3way_cleanup + scrypt_core_3way_cleanup() ret #if defined(USE_AVX2) -.macro salsa8_core_6way_avx2_doubleround - vpaddd %ymm0, %ymm1, %ymm4 - vpaddd %ymm8, %ymm9, %ymm6 - vpaddd %ymm12, %ymm13, %ymm7 - vpslld $7, %ymm4, %ymm5 - vpsrld $25, %ymm4, %ymm4 - vpxor %ymm5, %ymm3, %ymm3 - vpxor %ymm4, %ymm3, %ymm3 - vpslld $7, %ymm6, %ymm5 - vpsrld $25, %ymm6, %ymm6 - vpxor %ymm5, %ymm11, %ymm11 - vpxor %ymm6, %ymm11, %ymm11 - vpslld $7, %ymm7, %ymm5 - vpsrld $25, %ymm7, %ymm7 - vpxor %ymm5, %ymm15, %ymm15 - vpxor %ymm7, %ymm15, %ymm15 - - vpaddd %ymm3, %ymm0, %ymm4 - vpaddd %ymm11, %ymm8, %ymm6 - vpaddd %ymm15, %ymm12, %ymm7 - vpslld $9, %ymm4, %ymm5 - vpsrld $23, %ymm4, %ymm4 - vpxor %ymm5, %ymm2, %ymm2 - vpxor %ymm4, %ymm2, %ymm2 - vpslld $9, %ymm6, %ymm5 - vpsrld 
$23, %ymm6, %ymm6 - vpxor %ymm5, %ymm10, %ymm10 - vpxor %ymm6, %ymm10, %ymm10 - vpslld $9, %ymm7, %ymm5 - vpsrld $23, %ymm7, %ymm7 - vpxor %ymm5, %ymm14, %ymm14 - vpxor %ymm7, %ymm14, %ymm14 - - vpaddd %ymm2, %ymm3, %ymm4 - vpaddd %ymm10, %ymm11, %ymm6 - vpaddd %ymm14, %ymm15, %ymm7 - vpslld $13, %ymm4, %ymm5 - vpsrld $19, %ymm4, %ymm4 - vpshufd $0x93, %ymm3, %ymm3 - vpshufd $0x93, %ymm11, %ymm11 - vpshufd $0x93, %ymm15, %ymm15 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm4, %ymm1, %ymm1 - vpslld $13, %ymm6, %ymm5 - vpsrld $19, %ymm6, %ymm6 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm9, %ymm9 - vpslld $13, %ymm7, %ymm5 - vpsrld $19, %ymm7, %ymm7 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm7, %ymm13, %ymm13 - - vpaddd %ymm1, %ymm2, %ymm4 - vpaddd %ymm9, %ymm10, %ymm6 - vpaddd %ymm13, %ymm14, %ymm7 - vpslld $18, %ymm4, %ymm5 - vpsrld $14, %ymm4, %ymm4 - vpshufd $0x4e, %ymm2, %ymm2 - vpshufd $0x4e, %ymm10, %ymm10 - vpshufd $0x4e, %ymm14, %ymm14 - vpxor %ymm5, %ymm0, %ymm0 - vpxor %ymm4, %ymm0, %ymm0 - vpslld $18, %ymm6, %ymm5 - vpsrld $14, %ymm6, %ymm6 - vpxor %ymm5, %ymm8, %ymm8 - vpxor %ymm6, %ymm8, %ymm8 - vpslld $18, %ymm7, %ymm5 - vpsrld $14, %ymm7, %ymm7 - vpxor %ymm5, %ymm12, %ymm12 - vpxor %ymm7, %ymm12, %ymm12 - - vpaddd %ymm0, %ymm3, %ymm4 - vpaddd %ymm8, %ymm11, %ymm6 - vpaddd %ymm12, %ymm15, %ymm7 - vpslld $7, %ymm4, %ymm5 - vpsrld $25, %ymm4, %ymm4 - vpshufd $0x39, %ymm1, %ymm1 - vpxor %ymm5, %ymm1, %ymm1 - vpxor %ymm4, %ymm1, %ymm1 - vpslld $7, %ymm6, %ymm5 - vpsrld $25, %ymm6, %ymm6 - vpshufd $0x39, %ymm9, %ymm9 - vpxor %ymm5, %ymm9, %ymm9 - vpxor %ymm6, %ymm9, %ymm9 - vpslld $7, %ymm7, %ymm5 - vpsrld $25, %ymm7, %ymm7 - vpshufd $0x39, %ymm13, %ymm13 - vpxor %ymm5, %ymm13, %ymm13 - vpxor %ymm7, %ymm13, %ymm13 - - vpaddd %ymm1, %ymm0, %ymm4 - vpaddd %ymm9, %ymm8, %ymm6 - vpaddd %ymm13, %ymm12, %ymm7 - vpslld $9, %ymm4, %ymm5 - vpsrld $23, %ymm4, %ymm4 - vpxor %ymm5, %ymm2, %ymm2 - vpxor %ymm4, %ymm2, %ymm2 - vpslld $9, %ymm6, %ymm5 - vpsrld $23, %ymm6, %ymm6 - vpxor %ymm5, %ymm10, %ymm10 - vpxor %ymm6, %ymm10, %ymm10 - vpslld $9, %ymm7, %ymm5 - vpsrld $23, %ymm7, %ymm7 - vpxor %ymm5, %ymm14, %ymm14 - vpxor %ymm7, %ymm14, %ymm14 - - vpaddd %ymm2, %ymm1, %ymm4 - vpaddd %ymm10, %ymm9, %ymm6 - vpaddd %ymm14, %ymm13, %ymm7 - vpslld $13, %ymm4, %ymm5 - vpsrld $19, %ymm4, %ymm4 - vpshufd $0x93, %ymm1, %ymm1 - vpshufd $0x93, %ymm9, %ymm9 - vpshufd $0x93, %ymm13, %ymm13 - vpxor %ymm5, %ymm3, %ymm3 - vpxor %ymm4, %ymm3, %ymm3 - vpslld $13, %ymm6, %ymm5 - vpsrld $19, %ymm6, %ymm6 - vpxor %ymm5, %ymm11, %ymm11 - vpxor %ymm6, %ymm11, %ymm11 - vpslld $13, %ymm7, %ymm5 - vpsrld $19, %ymm7, %ymm7 - vpxor %ymm5, %ymm15, %ymm15 - vpxor %ymm7, %ymm15, %ymm15 - - vpaddd %ymm3, %ymm2, %ymm4 - vpaddd %ymm11, %ymm10, %ymm6 - vpaddd %ymm15, %ymm14, %ymm7 - vpslld $18, %ymm4, %ymm5 - vpsrld $14, %ymm4, %ymm4 - vpshufd $0x4e, %ymm2, %ymm2 - vpshufd $0x4e, %ymm10, %ymm10 - vpxor %ymm5, %ymm0, %ymm0 - vpxor %ymm4, %ymm0, %ymm0 - vpslld $18, %ymm6, %ymm5 - vpsrld $14, %ymm6, %ymm6 - vpshufd $0x4e, %ymm14, %ymm14 - vpshufd $0x39, %ymm11, %ymm11 - vpxor %ymm5, %ymm8, %ymm8 - vpxor %ymm6, %ymm8, %ymm8 - vpslld $18, %ymm7, %ymm5 - vpsrld $14, %ymm7, %ymm7 - vpshufd $0x39, %ymm3, %ymm3 - vpshufd $0x39, %ymm15, %ymm15 - vpxor %ymm5, %ymm12, %ymm12 - vpxor %ymm7, %ymm12, %ymm12 -.endm +#define salsa8_core_6way_avx2_doubleround() \ + vpaddd %ymm0, %ymm1, %ymm4; \ + vpaddd %ymm8, %ymm9, %ymm6; \ + vpaddd %ymm12, %ymm13, %ymm7; \ + vpslld $7, %ymm4, %ymm5; \ + vpsrld $25, %ymm4, %ymm4; \ + vpxor %ymm5, %ymm3, %ymm3; \ + 
vpxor %ymm4, %ymm3, %ymm3; \ + vpslld $7, %ymm6, %ymm5; \ + vpsrld $25, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm11, %ymm11; \ + vpxor %ymm6, %ymm11, %ymm11; \ + vpslld $7, %ymm7, %ymm5; \ + vpsrld $25, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm15, %ymm15; \ + vpxor %ymm7, %ymm15, %ymm15; \ + vpaddd %ymm3, %ymm0, %ymm4; \ + vpaddd %ymm11, %ymm8, %ymm6; \ + vpaddd %ymm15, %ymm12, %ymm7; \ + vpslld $9, %ymm4, %ymm5; \ + vpsrld $23, %ymm4, %ymm4; \ + vpxor %ymm5, %ymm2, %ymm2; \ + vpxor %ymm4, %ymm2, %ymm2; \ + vpslld $9, %ymm6, %ymm5; \ + vpsrld $23, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm10, %ymm10; \ + vpxor %ymm6, %ymm10, %ymm10; \ + vpslld $9, %ymm7, %ymm5; \ + vpsrld $23, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm14, %ymm14; \ + vpxor %ymm7, %ymm14, %ymm14; \ + vpaddd %ymm2, %ymm3, %ymm4; \ + vpaddd %ymm10, %ymm11, %ymm6; \ + vpaddd %ymm14, %ymm15, %ymm7; \ + vpslld $13, %ymm4, %ymm5; \ + vpsrld $19, %ymm4, %ymm4; \ + vpshufd $0x93, %ymm3, %ymm3; \ + vpshufd $0x93, %ymm11, %ymm11; \ + vpshufd $0x93, %ymm15, %ymm15; \ + vpxor %ymm5, %ymm1, %ymm1; \ + vpxor %ymm4, %ymm1, %ymm1; \ + vpslld $13, %ymm6, %ymm5; \ + vpsrld $19, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm9, %ymm9; \ + vpxor %ymm6, %ymm9, %ymm9; \ + vpslld $13, %ymm7, %ymm5; \ + vpsrld $19, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm13, %ymm13; \ + vpxor %ymm7, %ymm13, %ymm13; \ + vpaddd %ymm1, %ymm2, %ymm4; \ + vpaddd %ymm9, %ymm10, %ymm6; \ + vpaddd %ymm13, %ymm14, %ymm7; \ + vpslld $18, %ymm4, %ymm5; \ + vpsrld $14, %ymm4, %ymm4; \ + vpshufd $0x4e, %ymm2, %ymm2; \ + vpshufd $0x4e, %ymm10, %ymm10; \ + vpshufd $0x4e, %ymm14, %ymm14; \ + vpxor %ymm5, %ymm0, %ymm0; \ + vpxor %ymm4, %ymm0, %ymm0; \ + vpslld $18, %ymm6, %ymm5; \ + vpsrld $14, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm8, %ymm8; \ + vpxor %ymm6, %ymm8, %ymm8; \ + vpslld $18, %ymm7, %ymm5; \ + vpsrld $14, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm12, %ymm12; \ + vpxor %ymm7, %ymm12, %ymm12; \ + vpaddd %ymm0, %ymm3, %ymm4; \ + vpaddd %ymm8, %ymm11, %ymm6; \ + vpaddd %ymm12, %ymm15, %ymm7; \ + vpslld $7, %ymm4, %ymm5; \ + vpsrld $25, %ymm4, %ymm4; \ + vpshufd $0x39, %ymm1, %ymm1; \ + vpxor %ymm5, %ymm1, %ymm1; \ + vpxor %ymm4, %ymm1, %ymm1; \ + vpslld $7, %ymm6, %ymm5; \ + vpsrld $25, %ymm6, %ymm6; \ + vpshufd $0x39, %ymm9, %ymm9; \ + vpxor %ymm5, %ymm9, %ymm9; \ + vpxor %ymm6, %ymm9, %ymm9; \ + vpslld $7, %ymm7, %ymm5; \ + vpsrld $25, %ymm7, %ymm7; \ + vpshufd $0x39, %ymm13, %ymm13; \ + vpxor %ymm5, %ymm13, %ymm13; \ + vpxor %ymm7, %ymm13, %ymm13; \ + vpaddd %ymm1, %ymm0, %ymm4; \ + vpaddd %ymm9, %ymm8, %ymm6; \ + vpaddd %ymm13, %ymm12, %ymm7; \ + vpslld $9, %ymm4, %ymm5; \ + vpsrld $23, %ymm4, %ymm4; \ + vpxor %ymm5, %ymm2, %ymm2; \ + vpxor %ymm4, %ymm2, %ymm2; \ + vpslld $9, %ymm6, %ymm5; \ + vpsrld $23, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm10, %ymm10; \ + vpxor %ymm6, %ymm10, %ymm10; \ + vpslld $9, %ymm7, %ymm5; \ + vpsrld $23, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm14, %ymm14; \ + vpxor %ymm7, %ymm14, %ymm14; \ + vpaddd %ymm2, %ymm1, %ymm4; \ + vpaddd %ymm10, %ymm9, %ymm6; \ + vpaddd %ymm14, %ymm13, %ymm7; \ + vpslld $13, %ymm4, %ymm5; \ + vpsrld $19, %ymm4, %ymm4; \ + vpshufd $0x93, %ymm1, %ymm1; \ + vpshufd $0x93, %ymm9, %ymm9; \ + vpshufd $0x93, %ymm13, %ymm13; \ + vpxor %ymm5, %ymm3, %ymm3; \ + vpxor %ymm4, %ymm3, %ymm3; \ + vpslld $13, %ymm6, %ymm5; \ + vpsrld $19, %ymm6, %ymm6; \ + vpxor %ymm5, %ymm11, %ymm11; \ + vpxor %ymm6, %ymm11, %ymm11; \ + vpslld $13, %ymm7, %ymm5; \ + vpsrld $19, %ymm7, %ymm7; \ + vpxor %ymm5, %ymm15, %ymm15; \ + vpxor %ymm7, %ymm15, %ymm15; \ + vpaddd %ymm3, %ymm2, %ymm4; \ + vpaddd %ymm11, 
%ymm10, %ymm6; \ + vpaddd %ymm15, %ymm14, %ymm7; \ + vpslld $18, %ymm4, %ymm5; \ + vpsrld $14, %ymm4, %ymm4; \ + vpshufd $0x4e, %ymm2, %ymm2; \ + vpshufd $0x4e, %ymm10, %ymm10; \ + vpxor %ymm5, %ymm0, %ymm0; \ + vpxor %ymm4, %ymm0, %ymm0; \ + vpslld $18, %ymm6, %ymm5; \ + vpsrld $14, %ymm6, %ymm6; \ + vpshufd $0x4e, %ymm14, %ymm14; \ + vpshufd $0x39, %ymm11, %ymm11; \ + vpxor %ymm5, %ymm8, %ymm8; \ + vpxor %ymm6, %ymm8, %ymm8; \ + vpslld $18, %ymm7, %ymm5; \ + vpsrld $14, %ymm7, %ymm7; \ + vpshufd $0x39, %ymm3, %ymm3; \ + vpshufd $0x39, %ymm15, %ymm15; \ + vpxor %ymm5, %ymm12, %ymm12; \ + vpxor %ymm7, %ymm12, %ymm12; \ + + +#define salsa8_core_6way_avx2() \ + salsa8_core_6way_avx2_doubleround(); \ + salsa8_core_6way_avx2_doubleround(); \ + salsa8_core_6way_avx2_doubleround(); \ + salsa8_core_6way_avx2_doubleround(); \ -.macro salsa8_core_6way_avx2 - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround - salsa8_core_6way_avx2_doubleround -.endm .text .p2align 6 @@ -2475,80 +2396,65 @@ _scrypt_core_6way: subq $768, %rsp andq $-128, %rsp -.macro scrypt_core_6way_cleanup - movq %rdx, %rsp -#if defined(_WIN64) || defined(__CYGWIN__) - popq %rsi - popq %rdi - vmovdqa 8(%rsp), %xmm6 - vmovdqa 24(%rsp), %xmm7 - vmovdqa 40(%rsp), %xmm8 - vmovdqa 56(%rsp), %xmm9 - vmovdqa 72(%rsp), %xmm10 - vmovdqa 88(%rsp), %xmm11 - vmovdqa 104(%rsp), %xmm12 - vmovdqa 120(%rsp), %xmm13 - vmovdqa 136(%rsp), %xmm14 - vmovdqa 152(%rsp), %xmm15 - addq $176, %rsp -#endif - popq %rbp - popq %rbx -.endm +#define scrypt_core_6way_cleanup() \ + movq %rdx, %rsp; \ + popq %rbp; \ + popq %rbx; \ + + +#define scrypt_shuffle_pack2(src, so, dest, do) \ + vmovdqa so+0*16(src), %xmm0; \ + vmovdqa so+1*16(src), %xmm1; \ + vmovdqa so+2*16(src), %xmm2; \ + vmovdqa so+3*16(src), %xmm3; \ + vinserti128 $1, so+128+0*16(src), %ymm0, %ymm0; \ + vinserti128 $1, so+128+1*16(src), %ymm1, %ymm1; \ + vinserti128 $1, so+128+2*16(src), %ymm2, %ymm2; \ + vinserti128 $1, so+128+3*16(src), %ymm3, %ymm3; \ + vpblendd $0x33, %ymm0, %ymm2, %ymm4; \ + vpblendd $0xcc, %ymm1, %ymm3, %ymm5; \ + vpblendd $0x33, %ymm2, %ymm0, %ymm6; \ + vpblendd $0xcc, %ymm3, %ymm1, %ymm7; \ + vpblendd $0x55, %ymm7, %ymm6, %ymm3; \ + vpblendd $0x55, %ymm6, %ymm5, %ymm2; \ + vpblendd $0x55, %ymm5, %ymm4, %ymm1; \ + vpblendd $0x55, %ymm4, %ymm7, %ymm0; \ + vmovdqa %ymm0, do+0*32(dest); \ + vmovdqa %ymm1, do+1*32(dest); \ + vmovdqa %ymm2, do+2*32(dest); \ + vmovdqa %ymm3, do+3*32(dest); \ + -.macro scrypt_shuffle_pack2 src, so, dest, do - vmovdqa \so+0*16(\src), %xmm0 - vmovdqa \so+1*16(\src), %xmm1 - vmovdqa \so+2*16(\src), %xmm2 - vmovdqa \so+3*16(\src), %xmm3 - vinserti128 $1, \so+128+0*16(\src), %ymm0, %ymm0 - vinserti128 $1, \so+128+1*16(\src), %ymm1, %ymm1 - vinserti128 $1, \so+128+2*16(\src), %ymm2, %ymm2 - vinserti128 $1, \so+128+3*16(\src), %ymm3, %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %ymm0, \do+0*32(\dest) - vmovdqa %ymm1, \do+1*32(\dest) - vmovdqa %ymm2, \do+2*32(\dest) - vmovdqa %ymm3, \do+3*32(\dest) -.endm +#define scrypt_shuffle_unpack2(src, so, dest, do) \ + vmovdqa so+0*32(src), %ymm0; \ + vmovdqa so+1*32(src), %ymm1; \ + vmovdqa so+2*32(src), %ymm2; \ + vmovdqa so+3*32(src), %ymm3; \ + vpblendd $0x33, %ymm0, %ymm2, %ymm4; \ + 
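
/*
 * Editor's note on the AVX2 path above: scrypt_shuffle_pack2 uses vinserti128
 * to place the matching 16-byte row of a second scrypt lane (the block 128
 * bytes later in the source) into the upper half of each ymm register, so one
 * 256-bit operation in salsa8_core_6way_avx2 advances two lanes; with three
 * register groups that gives the six-way kernel.  The vpblendd sequence also
 * applies the usual word shuffle, which the plain-C sketch below deliberately
 * ignores -- it only illustrates the two-lanes-per-register pairing, and the
 * names and layout are illustrative, not the exact in-memory format.
 */

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Pack two 64-byte blocks (16 words each) so every 32-byte "row" holds the
 * corresponding 16-byte rows of both lanes side by side, mirroring a ymm
 * register carrying lane A in its low 128 bits and lane B in its high 128. */
static void pack_two_lanes(uint32_t out[32],
                           const uint32_t laneA[16], const uint32_t laneB[16])
{
    for (int r = 0; r < 4; r++) {
        memcpy(&out[8 * r],     &laneA[4 * r], 16);  /* low half of the row  */
        memcpy(&out[8 * r + 4], &laneB[4 * r], 16);  /* high half of the row */
    }
}

static void unpack_two_lanes(uint32_t laneA[16], uint32_t laneB[16],
                             const uint32_t in[32])
{
    for (int r = 0; r < 4; r++) {
        memcpy(&laneA[4 * r], &in[8 * r],     16);
        memcpy(&laneB[4 * r], &in[8 * r + 4], 16);
    }
}

int main(void)
{
    uint32_t a[16], b[16], a2[16], b2[16], packed[32];
    for (int i = 0; i < 16; i++) { a[i] = (uint32_t)i; b[i] = 100u + (uint32_t)i; }
    pack_two_lanes(packed, a, b);
    unpack_two_lanes(a2, b2, packed);
    assert(memcmp(a, a2, sizeof a) == 0 && memcmp(b, b2, sizeof b) == 0);
    return 0;
}
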
vpblendd $0xcc, %ymm1, %ymm3, %ymm5; \ + vpblendd $0x33, %ymm2, %ymm0, %ymm6; \ + vpblendd $0xcc, %ymm3, %ymm1, %ymm7; \ + vpblendd $0x55, %ymm7, %ymm6, %ymm3; \ + vpblendd $0x55, %ymm6, %ymm5, %ymm2; \ + vpblendd $0x55, %ymm5, %ymm4, %ymm1; \ + vpblendd $0x55, %ymm4, %ymm7, %ymm0; \ + vmovdqa %xmm0, do+0*16(dest); \ + vmovdqa %xmm1, do+1*16(dest); \ + vmovdqa %xmm2, do+2*16(dest); \ + vmovdqa %xmm3, do+3*16(dest); \ + vextracti128 $1, %ymm0, do+128+0*16(dest); \ + vextracti128 $1, %ymm1, do+128+1*16(dest); \ + vextracti128 $1, %ymm2, do+128+2*16(dest); \ + vextracti128 $1, %ymm3, do+128+3*16(dest); \ -.macro scrypt_shuffle_unpack2 src, so, dest, do - vmovdqa \so+0*32(\src), %ymm0 - vmovdqa \so+1*32(\src), %ymm1 - vmovdqa \so+2*32(\src), %ymm2 - vmovdqa \so+3*32(\src), %ymm3 - vpblendd $0x33, %ymm0, %ymm2, %ymm4 - vpblendd $0xcc, %ymm1, %ymm3, %ymm5 - vpblendd $0x33, %ymm2, %ymm0, %ymm6 - vpblendd $0xcc, %ymm3, %ymm1, %ymm7 - vpblendd $0x55, %ymm7, %ymm6, %ymm3 - vpblendd $0x55, %ymm6, %ymm5, %ymm2 - vpblendd $0x55, %ymm5, %ymm4, %ymm1 - vpblendd $0x55, %ymm4, %ymm7, %ymm0 - vmovdqa %xmm0, \do+0*16(\dest) - vmovdqa %xmm1, \do+1*16(\dest) - vmovdqa %xmm2, \do+2*16(\dest) - vmovdqa %xmm3, \do+3*16(\dest) - vextracti128 $1, %ymm0, \do+128+0*16(\dest) - vextracti128 $1, %ymm1, \do+128+1*16(\dest) - vextracti128 $1, %ymm2, \do+128+2*16(\dest) - vextracti128 $1, %ymm3, \do+128+3*16(\dest) -.endm scrypt_core_6way_avx2: - scrypt_shuffle_pack2 %rdi, 0*256+0, %rsp, 0*128 - scrypt_shuffle_pack2 %rdi, 0*256+64, %rsp, 1*128 - scrypt_shuffle_pack2 %rdi, 1*256+0, %rsp, 2*128 - scrypt_shuffle_pack2 %rdi, 1*256+64, %rsp, 3*128 - scrypt_shuffle_pack2 %rdi, 2*256+0, %rsp, 4*128 - scrypt_shuffle_pack2 %rdi, 2*256+64, %rsp, 5*128 + scrypt_shuffle_pack2(%rdi, 0*256+0, %rsp, 0*128) + scrypt_shuffle_pack2(%rdi, 0*256+64, %rsp, 1*128) + scrypt_shuffle_pack2(%rdi, 1*256+0, %rsp, 2*128) + scrypt_shuffle_pack2(%rdi, 1*256+64, %rsp, 3*128) + scrypt_shuffle_pack2(%rdi, 2*256+0, %rsp, 4*128) + scrypt_shuffle_pack2(%rdi, 2*256+64, %rsp, 5*128) vmovdqa 0*256+4*32(%rsp), %ymm0 vmovdqa 0*256+5*32(%rsp), %ymm1 @@ -2605,7 +2511,7 @@ scrypt_core_6way_avx2_loop1: vmovdqa %ymm14, 2*256+2*32(%rbx) vmovdqa %ymm15, 2*256+3*32(%rbx) - salsa8_core_6way_avx2 + salsa8_core_6way_avx2() vpaddd 0*256+0*32(%rbx), %ymm0, %ymm0 vpaddd 0*256+1*32(%rbx), %ymm1, %ymm1 vpaddd 0*256+2*32(%rbx), %ymm2, %ymm2 @@ -2655,7 +2561,7 @@ scrypt_core_6way_avx2_loop1: vmovdqa %ymm13, 2*256+5*32(%rsp) vmovdqa %ymm14, 2*256+6*32(%rsp) vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 + salsa8_core_6way_avx2() vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 @@ -2777,7 +2683,7 @@ scrypt_core_6way_avx2_loop2: vmovdqa %ymm13, 2*256+1*32(%rsp) vmovdqa %ymm14, 2*256+2*32(%rsp) vmovdqa %ymm15, 2*256+3*32(%rsp) - salsa8_core_6way_avx2 + salsa8_core_6way_avx2() vpaddd 0*256+0*32(%rsp), %ymm0, %ymm0 vpaddd 0*256+1*32(%rsp), %ymm1, %ymm1 vpaddd 0*256+2*32(%rsp), %ymm2, %ymm2 @@ -2863,7 +2769,7 @@ scrypt_core_6way_avx2_loop2: vmovdqa %ymm13, 2*256+5*32(%rsp) vmovdqa %ymm14, 2*256+6*32(%rsp) vmovdqa %ymm15, 2*256+7*32(%rsp) - salsa8_core_6way_avx2 + salsa8_core_6way_avx2() vpaddd 0*256+4*32(%rsp), %ymm0, %ymm0 vpaddd 0*256+5*32(%rsp), %ymm1, %ymm1 vpaddd 0*256+6*32(%rsp), %ymm2, %ymm2 @@ -2892,14 +2798,14 @@ scrypt_core_6way_avx2_loop2: subq $1, %rcx ja scrypt_core_6way_avx2_loop2 - scrypt_shuffle_unpack2 %rsp, 0*128, %rdi, 0*256+0 - scrypt_shuffle_unpack2 %rsp, 1*128, %rdi, 0*256+64 - 
scrypt_shuffle_unpack2 %rsp, 2*128, %rdi, 1*256+0 - scrypt_shuffle_unpack2 %rsp, 3*128, %rdi, 1*256+64 - scrypt_shuffle_unpack2 %rsp, 4*128, %rdi, 2*256+0 - scrypt_shuffle_unpack2 %rsp, 5*128, %rdi, 2*256+64 + scrypt_shuffle_unpack2(%rsp, 0*128, %rdi, 0*256+0) + scrypt_shuffle_unpack2(%rsp, 1*128, %rdi, 0*256+64) + scrypt_shuffle_unpack2(%rsp, 2*128, %rdi, 1*256+0) + scrypt_shuffle_unpack2(%rsp, 3*128, %rdi, 1*256+64) + scrypt_shuffle_unpack2(%rsp, 4*128, %rdi, 2*256+0) + scrypt_shuffle_unpack2(%rsp, 5*128, %rdi, 2*256+64) - scrypt_core_6way_cleanup + scrypt_core_6way_cleanup() ret #endif /* USE_AVX2 */ diff --git a/scrypt-x86.S b/scrypt-x86.S index 5ab7eda65..ff4f8b6ab 100644 --- a/scrypt-x86.S +++ b/scrypt-x86.S @@ -32,366 +32,366 @@ #if defined(USE_ASM) && defined(__i386__) -.macro scrypt_shuffle src, so, dest, do - movl \so+60(\src), %eax - movl \so+44(\src), %ebx - movl \so+28(\src), %ecx - movl \so+12(\src), %edx - movl %eax, \do+12(\dest) - movl %ebx, \do+28(\dest) - movl %ecx, \do+44(\dest) - movl %edx, \do+60(\dest) - movl \so+40(\src), %eax - movl \so+8(\src), %ebx - movl \so+48(\src), %ecx - movl \so+16(\src), %edx - movl %eax, \do+8(\dest) - movl %ebx, \do+40(\dest) - movl %ecx, \do+16(\dest) - movl %edx, \do+48(\dest) - movl \so+20(\src), %eax - movl \so+4(\src), %ebx - movl \so+52(\src), %ecx - movl \so+36(\src), %edx - movl %eax, \do+4(\dest) - movl %ebx, \do+20(\dest) - movl %ecx, \do+36(\dest) - movl %edx, \do+52(\dest) - movl \so+0(\src), %eax - movl \so+24(\src), %ebx - movl \so+32(\src), %ecx - movl \so+56(\src), %edx - movl %eax, \do+0(\dest) - movl %ebx, \do+24(\dest) - movl %ecx, \do+32(\dest) - movl %edx, \do+56(\dest) -.endm +#define scrypt_shuffle(src, so, dest, do) \ + movl so+60(src), %eax; \ + movl so+44(src), %ebx; \ + movl so+28(src), %ecx; \ + movl so+12(src), %edx; \ + movl %eax, do+12(dest); \ + movl %ebx, do+28(dest); \ + movl %ecx, do+44(dest); \ + movl %edx, do+60(dest); \ + movl so+40(src), %eax; \ + movl so+8(src), %ebx; \ + movl so+48(src), %ecx; \ + movl so+16(src), %edx; \ + movl %eax, do+8(dest); \ + movl %ebx, do+40(dest); \ + movl %ecx, do+16(dest); \ + movl %edx, do+48(dest); \ + movl so+20(src), %eax; \ + movl so+4(src), %ebx; \ + movl so+52(src), %ecx; \ + movl so+36(src), %edx; \ + movl %eax, do+4(dest); \ + movl %ebx, do+20(dest); \ + movl %ecx, do+36(dest); \ + movl %edx, do+52(dest); \ + movl so+0(src), %eax; \ + movl so+24(src), %ebx; \ + movl so+32(src), %ecx; \ + movl so+56(src), %edx; \ + movl %eax, do+0(dest); \ + movl %ebx, do+24(dest); \ + movl %ecx, do+32(dest); \ + movl %edx, do+56(dest); \ + + +#define salsa8_core_gen_quadround() \ + movl 52(%esp), %ecx; \ + movl 4(%esp), %edx; \ + movl 20(%esp), %ebx; \ + movl 8(%esp), %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 4(%esp); \ + movl 36(%esp), %edi; \ + leal (%edx, %ebx), %ebp; \ + roll $9, %ebp; \ + xorl %ebp, %edi; \ + movl 24(%esp), %ebp; \ + movl %edi, 8(%esp); \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 40(%esp), %ebx; \ + movl %ecx, 20(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 24(%esp); \ + movl 56(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 36(%esp); \ + movl 28(%esp), %ecx; \ + movl %edx, 28(%esp); \ + movl 44(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 60(%esp), %ebx; \ + movl %esi, 40(%esp); \ + addl 
%edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 44(%esp); \ + movl 12(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 12(%esp); \ + movl 48(%esp), %esi; \ + movl %ebp, 48(%esp); \ + movl 64(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl 32(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 32(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 52(%esp); \ + movl 28(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 40(%esp), %ebx; \ + movl %esi, 28(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 40(%esp); \ + movl 12(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 12(%esp); \ + movl 4(%esp), %esi; \ + movl %ebp, 4(%esp); \ + movl 48(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 48(%esp); \ + movl 32(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 32(%esp); \ + movl 24(%esp), %ecx; \ + movl %edx, 24(%esp); \ + movl 52(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 28(%esp), %ebx; \ + movl %esi, 28(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 52(%esp); \ + movl 8(%esp), %edi; \ + xorl %esi, %ebp; \ + leal (%edx, %ebx), %esi; \ + roll $9, %esi; \ + xorl %esi, %edi; \ + movl %edi, 8(%esp); \ + movl 44(%esp), %esi; \ + movl %ebp, 44(%esp); \ + movl 4(%esp), %ebp; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 20(%esp), %ebx; \ + movl %ecx, 4(%esp); \ + addl %edi, %ecx; \ + roll $18, %ecx; \ + leal (%esi, %ebp), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl 36(%esp), %edi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %edi; \ + movl %edi, 20(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 36(%esp); \ + movl 24(%esp), %edx; \ + addl %edi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %esi; \ + movl 28(%esp), %ebx; \ + movl %esi, 24(%esp); \ + addl %edi, %esi; \ + roll $18, %esi; \ + leal (%ecx, %edx), %edi; \ + roll $7, %edi; \ + xorl %edi, %ebx; \ + movl %ebx, 28(%esp); \ + xorl %esi, %ebp; \ + movl 8(%esp), %esi; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl 40(%esp), %edi; \ + movl %ebp, 8(%esp); \ + movl 44(%esp), %ebp; \ + movl %esi, 40(%esp); \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 4(%esp), %ebx; \ + movl %ecx, 44(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 4(%esp); \ + movl 20(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 56(%esp); \ + movl 48(%esp), %ecx; \ + movl %edx, 20(%esp); \ + movl 36(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + 
xorl %ebx, %edi; \ + movl 24(%esp), %ebx; \ + movl %edi, 24(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 60(%esp); \ + movl 12(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 12(%esp); \ + movl 52(%esp), %edi; \ + movl %ebp, 36(%esp); \ + movl 8(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl 32(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 32(%esp); \ + movl %ebx, %ecx; \ + movl %edx, 48(%esp); \ + movl 20(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl 24(%esp), %ebx; \ + movl %edi, 20(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 8(%esp); \ + movl 12(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 12(%esp); \ + movl 28(%esp), %edi; \ + movl %ebp, 52(%esp); \ + movl 36(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 16(%esp), %ebx; \ + movl %ecx, 16(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 28(%esp); \ + movl 32(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 32(%esp); \ + movl 4(%esp), %ecx; \ + movl %edx, 4(%esp); \ + movl 48(%esp), %edx; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl 20(%esp), %ebx; \ + movl %edi, 20(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + leal (%ecx, %edx), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 48(%esp); \ + movl 40(%esp), %esi; \ + xorl %edi, %ebp; \ + leal (%edx, %ebx), %edi; \ + roll $9, %edi; \ + xorl %edi, %esi; \ + movl %esi, 36(%esp); \ + movl 60(%esp), %edi; \ + movl %ebp, 24(%esp); \ + movl 52(%esp), %ebp; \ + addl %esi, %ebx; \ + roll $13, %ebx; \ + xorl %ebx, %ecx; \ + movl 44(%esp), %ebx; \ + movl %ecx, 40(%esp); \ + addl %esi, %ecx; \ + roll $18, %ecx; \ + leal (%edi, %ebp), %esi; \ + roll $7, %esi; \ + xorl %esi, %ebx; \ + movl %ebx, 52(%esp); \ + movl 56(%esp), %esi; \ + xorl %ecx, %edx; \ + leal (%ebp, %ebx), %ecx; \ + roll $9, %ecx; \ + xorl %ecx, %esi; \ + movl %esi, 56(%esp); \ + addl %esi, %ebx; \ + movl %edx, 44(%esp); \ + roll $13, %ebx; \ + xorl %ebx, %edi; \ + movl %edi, 60(%esp); \ + addl %esi, %edi; \ + roll $18, %edi; \ + xorl %edi, %ebp; \ + movl %ebp, 64(%esp); \ -.macro salsa8_core_gen_quadround - movl 52(%esp), %ecx - movl 4(%esp), %edx - movl 20(%esp), %ebx - movl 8(%esp), %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 4(%esp) - movl 36(%esp), %edi - leal (%edx, %ebx), %ebp - roll $9, %ebp - xorl %ebp, %edi - movl 24(%esp), %ebp - movl %edi, 8(%esp) - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 40(%esp), %ebx - movl %ecx, 20(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 24(%esp) - movl 56(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 36(%esp) - movl 28(%esp), %ecx - movl %edx, 28(%esp) - movl 44(%esp), %edx - addl 
%edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 60(%esp), %ebx - movl %esi, 40(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 44(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 48(%esp), %esi - movl %ebp, 48(%esp) - movl 64(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl %ebx, %ecx - movl %edx, 52(%esp) - movl 28(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 40(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 40(%esp) - movl 12(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 12(%esp) - movl 4(%esp), %esi - movl %ebp, 4(%esp) - movl 48(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 48(%esp) - movl 32(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 32(%esp) - movl 24(%esp), %ecx - movl %edx, 24(%esp) - movl 52(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 28(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 52(%esp) - movl 8(%esp), %edi - xorl %esi, %ebp - leal (%edx, %ebx), %esi - roll $9, %esi - xorl %esi, %edi - movl %edi, 8(%esp) - movl 44(%esp), %esi - movl %ebp, 44(%esp) - movl 4(%esp), %ebp - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 20(%esp), %ebx - movl %ecx, 4(%esp) - addl %edi, %ecx - roll $18, %ecx - leal (%esi, %ebp), %edi - roll $7, %edi - xorl %edi, %ebx - movl 36(%esp), %edi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %edi - movl %edi, 20(%esp) - movl %ebx, %ecx - movl %edx, 36(%esp) - movl 24(%esp), %edx - addl %edi, %ebx - roll $13, %ebx - xorl %ebx, %esi - movl 28(%esp), %ebx - movl %esi, 24(%esp) - addl %edi, %esi - roll $18, %esi - leal (%ecx, %edx), %edi - roll $7, %edi - xorl %edi, %ebx - movl %ebx, 28(%esp) - xorl %esi, %ebp - movl 8(%esp), %esi - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl 40(%esp), %edi - movl %ebp, 8(%esp) - movl 44(%esp), %ebp - movl %esi, 40(%esp) - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 4(%esp), %ebx - movl %ecx, 44(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 4(%esp) - movl 20(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - movl 48(%esp), %ecx - movl %edx, 20(%esp) - movl 36(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 24(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 60(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 52(%esp), %edi - movl %ebp, 36(%esp) - movl 8(%esp), %ebp - addl 
%esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl %ebx, %ecx - movl %edx, 48(%esp) - movl 20(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 24(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 8(%esp) - movl 12(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 12(%esp) - movl 28(%esp), %edi - movl %ebp, 52(%esp) - movl 36(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 16(%esp), %ebx - movl %ecx, 16(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 28(%esp) - movl 32(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 32(%esp) - movl 4(%esp), %ecx - movl %edx, 4(%esp) - movl 48(%esp), %edx - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %edi - movl 20(%esp), %ebx - movl %edi, 20(%esp) - addl %esi, %edi - roll $18, %edi - leal (%ecx, %edx), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 48(%esp) - movl 40(%esp), %esi - xorl %edi, %ebp - leal (%edx, %ebx), %edi - roll $9, %edi - xorl %edi, %esi - movl %esi, 36(%esp) - movl 60(%esp), %edi - movl %ebp, 24(%esp) - movl 52(%esp), %ebp - addl %esi, %ebx - roll $13, %ebx - xorl %ebx, %ecx - movl 44(%esp), %ebx - movl %ecx, 40(%esp) - addl %esi, %ecx - roll $18, %ecx - leal (%edi, %ebp), %esi - roll $7, %esi - xorl %esi, %ebx - movl %ebx, 52(%esp) - movl 56(%esp), %esi - xorl %ecx, %edx - leal (%ebp, %ebx), %ecx - roll $9, %ecx - xorl %ecx, %esi - movl %esi, 56(%esp) - addl %esi, %ebx - movl %edx, 44(%esp) - roll $13, %ebx - xorl %ebx, %edi - movl %edi, 60(%esp) - addl %esi, %edi - roll $18, %edi - xorl %edi, %ebp - movl %ebp, 64(%esp) -.endm .text .p2align 5 salsa8_core_gen: - salsa8_core_gen_quadround - salsa8_core_gen_quadround + salsa8_core_gen_quadround() + salsa8_core_gen_quadround() ret @@ -418,41 +418,41 @@ scrypt_core_gen: movl 28(%esp), %ecx subl $72, %esp -.macro scrypt_core_macro1a p, q - movl \p(%edi), %eax - movl \q(%edi), %edx - movl %eax, \p(%esi) - movl %edx, \q(%esi) - xorl %edx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro1b p, q - movl \p(%edi), %eax - xorl \p(%esi, %edx), %eax - movl \q(%edi), %ebx - xorl \q(%esi, %edx), %ebx - movl %ebx, \q(%edi) - xorl %ebx, %eax - movl %eax, \p(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro2 p, q - movl \p(%esp), %eax - addl \p(%edi), %eax - movl %eax, \p(%edi) - xorl \q(%edi), %eax - movl %eax, \q(%edi) - movl %eax, \p(%esp) -.endm - -.macro scrypt_core_macro3 p, q - movl \p(%esp), %eax - addl \q(%edi), %eax - movl %eax, \q(%edi) -.endm +#define scrypt_core_macro1a(p, q) \ + movl p(%edi), %eax; \ + movl q(%edi), %edx; \ + movl %eax, p(%esi); \ + movl %edx, q(%esi); \ + xorl %edx, %eax; \ + movl %eax, p(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro1b(p, q) \ + movl p(%edi), %eax; \ + xorl p(%esi, %edx), %eax; \ + movl q(%edi), %ebx; \ + xorl q(%esi, %edx), %ebx; \ + movl %ebx, q(%edi); \ + xorl %ebx, %eax; \ + movl %eax, p(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro2(p, q) \ + movl p(%esp), %eax; \ + addl p(%edi), %eax; \ + movl 
%eax, p(%edi); \ + xorl q(%edi), %eax; \ + movl %eax, q(%edi); \ + movl %eax, p(%esp); \ + + +#define scrypt_core_macro3(p, q) \ + movl p(%esp), %eax; \ + addl q(%edi), %eax; \ + movl %eax, q(%edi); \ + shll $7, %ecx addl %esi, %ecx @@ -460,62 +460,62 @@ scrypt_core_gen_loop1: movl %esi, 64(%esp) movl %ecx, 68(%esp) - scrypt_core_macro1a 0, 64 - scrypt_core_macro1a 4, 68 - scrypt_core_macro1a 8, 72 - scrypt_core_macro1a 12, 76 - scrypt_core_macro1a 16, 80 - scrypt_core_macro1a 20, 84 - scrypt_core_macro1a 24, 88 - scrypt_core_macro1a 28, 92 - scrypt_core_macro1a 32, 96 - scrypt_core_macro1a 36, 100 - scrypt_core_macro1a 40, 104 - scrypt_core_macro1a 44, 108 - scrypt_core_macro1a 48, 112 - scrypt_core_macro1a 52, 116 - scrypt_core_macro1a 56, 120 - scrypt_core_macro1a 60, 124 + scrypt_core_macro1a(0, 64) + scrypt_core_macro1a(4, 68) + scrypt_core_macro1a(8, 72) + scrypt_core_macro1a(12, 76) + scrypt_core_macro1a(16, 80) + scrypt_core_macro1a(20, 84) + scrypt_core_macro1a(24, 88) + scrypt_core_macro1a(28, 92) + scrypt_core_macro1a(32, 96) + scrypt_core_macro1a(36, 100) + scrypt_core_macro1a(40, 104) + scrypt_core_macro1a(44, 108) + scrypt_core_macro1a(48, 112) + scrypt_core_macro1a(52, 116) + scrypt_core_macro1a(56, 120) + scrypt_core_macro1a(60, 124) call salsa8_core_gen movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 + scrypt_core_macro2(0, 64) + scrypt_core_macro2(4, 68) + scrypt_core_macro2(8, 72) + scrypt_core_macro2(12, 76) + scrypt_core_macro2(16, 80) + scrypt_core_macro2(20, 84) + scrypt_core_macro2(24, 88) + scrypt_core_macro2(28, 92) + scrypt_core_macro2(32, 96) + scrypt_core_macro2(36, 100) + scrypt_core_macro2(40, 104) + scrypt_core_macro2(44, 108) + scrypt_core_macro2(48, 112) + scrypt_core_macro2(52, 116) + scrypt_core_macro2(56, 120) + scrypt_core_macro2(60, 124) call salsa8_core_gen movl 92(%esp), %edi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 + scrypt_core_macro3(0, 64) + scrypt_core_macro3(4, 68) + scrypt_core_macro3(8, 72) + scrypt_core_macro3(12, 76) + scrypt_core_macro3(16, 80) + scrypt_core_macro3(20, 84) + scrypt_core_macro3(24, 88) + scrypt_core_macro3(28, 92) + scrypt_core_macro3(32, 96) + scrypt_core_macro3(36, 100) + scrypt_core_macro3(40, 104) + scrypt_core_macro3(44, 108) + scrypt_core_macro3(48, 112) + scrypt_core_macro3(52, 116) + scrypt_core_macro3(56, 120) + scrypt_core_macro3(60, 124) movl 64(%esp), %esi movl 68(%esp), %ecx @@ -535,63 +535,63 @@ scrypt_core_gen_loop2: andl 100(%esp), %edx shll $7, %edx - scrypt_core_macro1b 0, 64 - scrypt_core_macro1b 4, 68 - scrypt_core_macro1b 8, 72 - scrypt_core_macro1b 12, 76 - scrypt_core_macro1b 16, 80 - scrypt_core_macro1b 20, 84 - scrypt_core_macro1b 24, 88 - scrypt_core_macro1b 28, 92 - 
scrypt_core_macro1b 32, 96 - scrypt_core_macro1b 36, 100 - scrypt_core_macro1b 40, 104 - scrypt_core_macro1b 44, 108 - scrypt_core_macro1b 48, 112 - scrypt_core_macro1b 52, 116 - scrypt_core_macro1b 56, 120 - scrypt_core_macro1b 60, 124 + scrypt_core_macro1b(0, 64) + scrypt_core_macro1b(4, 68) + scrypt_core_macro1b(8, 72) + scrypt_core_macro1b(12, 76) + scrypt_core_macro1b(16, 80) + scrypt_core_macro1b(20, 84) + scrypt_core_macro1b(24, 88) + scrypt_core_macro1b(28, 92) + scrypt_core_macro1b(32, 96) + scrypt_core_macro1b(36, 100) + scrypt_core_macro1b(40, 104) + scrypt_core_macro1b(44, 108) + scrypt_core_macro1b(48, 112) + scrypt_core_macro1b(52, 116) + scrypt_core_macro1b(56, 120) + scrypt_core_macro1b(60, 124) call salsa8_core_gen movl 92(%esp), %edi - scrypt_core_macro2 0, 64 - scrypt_core_macro2 4, 68 - scrypt_core_macro2 8, 72 - scrypt_core_macro2 12, 76 - scrypt_core_macro2 16, 80 - scrypt_core_macro2 20, 84 - scrypt_core_macro2 24, 88 - scrypt_core_macro2 28, 92 - scrypt_core_macro2 32, 96 - scrypt_core_macro2 36, 100 - scrypt_core_macro2 40, 104 - scrypt_core_macro2 44, 108 - scrypt_core_macro2 48, 112 - scrypt_core_macro2 52, 116 - scrypt_core_macro2 56, 120 - scrypt_core_macro2 60, 124 + scrypt_core_macro2(0, 64) + scrypt_core_macro2(4, 68) + scrypt_core_macro2(8, 72) + scrypt_core_macro2(12, 76) + scrypt_core_macro2(16, 80) + scrypt_core_macro2(20, 84) + scrypt_core_macro2(24, 88) + scrypt_core_macro2(28, 92) + scrypt_core_macro2(32, 96) + scrypt_core_macro2(36, 100) + scrypt_core_macro2(40, 104) + scrypt_core_macro2(44, 108) + scrypt_core_macro2(48, 112) + scrypt_core_macro2(52, 116) + scrypt_core_macro2(56, 120) + scrypt_core_macro2(60, 124) call salsa8_core_gen movl 92(%esp), %edi movl 96(%esp), %esi - scrypt_core_macro3 0, 64 - scrypt_core_macro3 4, 68 - scrypt_core_macro3 8, 72 - scrypt_core_macro3 12, 76 - scrypt_core_macro3 16, 80 - scrypt_core_macro3 20, 84 - scrypt_core_macro3 24, 88 - scrypt_core_macro3 28, 92 - scrypt_core_macro3 32, 96 - scrypt_core_macro3 36, 100 - scrypt_core_macro3 40, 104 - scrypt_core_macro3 44, 108 - scrypt_core_macro3 48, 112 - scrypt_core_macro3 52, 116 - scrypt_core_macro3 56, 120 - scrypt_core_macro3 60, 124 + scrypt_core_macro3(0, 64) + scrypt_core_macro3(4, 68) + scrypt_core_macro3(8, 72) + scrypt_core_macro3(12, 76) + scrypt_core_macro3(16, 80) + scrypt_core_macro3(20, 84) + scrypt_core_macro3(24, 88) + scrypt_core_macro3(28, 92) + scrypt_core_macro3(32, 96) + scrypt_core_macro3(36, 100) + scrypt_core_macro3(40, 104) + scrypt_core_macro3(44, 108) + scrypt_core_macro3(48, 112) + scrypt_core_macro3(52, 116) + scrypt_core_macro3(56, 120) + scrypt_core_macro3(60, 124) movl 68(%esp), %ecx subl $1, %ecx @@ -605,84 +605,77 @@ scrypt_core_gen_loop2: ret -.macro salsa8_core_sse2_doubleround - movdqa %xmm1, %xmm4 - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm3 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm3, %xmm3 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm1 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm1 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - movdqa %xmm3, %xmm4 - pxor %xmm5, %xmm0 - pshufd $0x39, %xmm1, %xmm1 - - paddd %xmm0, %xmm4 - movdqa %xmm4, %xmm5 - pslld $7, %xmm4 - psrld $25, %xmm5 - 
pxor %xmm4, %xmm1 - movdqa %xmm0, %xmm4 - pxor %xmm5, %xmm1 - - paddd %xmm1, %xmm4 - movdqa %xmm4, %xmm5 - pslld $9, %xmm4 - psrld $23, %xmm5 - pxor %xmm4, %xmm2 - movdqa %xmm1, %xmm4 - pxor %xmm5, %xmm2 - pshufd $0x93, %xmm1, %xmm1 - - paddd %xmm2, %xmm4 - movdqa %xmm4, %xmm5 - pslld $13, %xmm4 - psrld $19, %xmm5 - pxor %xmm4, %xmm3 - movdqa %xmm2, %xmm4 - pxor %xmm5, %xmm3 - pshufd $0x4e, %xmm2, %xmm2 - - paddd %xmm3, %xmm4 - movdqa %xmm4, %xmm5 - pslld $18, %xmm4 - psrld $14, %xmm5 - pxor %xmm4, %xmm0 - pshufd $0x39, %xmm3, %xmm3 - pxor %xmm5, %xmm0 -.endm +#define salsa8_core_sse2_doubleround() \ + movdqa %xmm1, %xmm4; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm0, %xmm4; \ + pxor %xmm5, %xmm3; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm3, %xmm4; \ + pxor %xmm5, %xmm2; \ + pshufd $0x93, %xmm3, %xmm3; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm2, %xmm4; \ + pxor %xmm5, %xmm1; \ + pshufd $0x4e, %xmm2, %xmm2; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + movdqa %xmm3, %xmm4; \ + pxor %xmm5, %xmm0; \ + pshufd $0x39, %xmm1, %xmm1; \ + paddd %xmm0, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $7, %xmm4; \ + psrld $25, %xmm5; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm0, %xmm4; \ + pxor %xmm5, %xmm1; \ + paddd %xmm1, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $9, %xmm4; \ + psrld $23, %xmm5; \ + pxor %xmm4, %xmm2; \ + movdqa %xmm1, %xmm4; \ + pxor %xmm5, %xmm2; \ + pshufd $0x93, %xmm1, %xmm1; \ + paddd %xmm2, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $13, %xmm4; \ + psrld $19, %xmm5; \ + pxor %xmm4, %xmm3; \ + movdqa %xmm2, %xmm4; \ + pxor %xmm5, %xmm3; \ + pshufd $0x4e, %xmm2, %xmm2; \ + paddd %xmm3, %xmm4; \ + movdqa %xmm4, %xmm5; \ + pslld $18, %xmm4; \ + psrld $14, %xmm5; \ + pxor %xmm4, %xmm0; \ + pshufd $0x39, %xmm3, %xmm3; \ + pxor %xmm5, %xmm0; \ + + +#define salsa8_core_sse2() \ + salsa8_core_sse2_doubleround(); \ + salsa8_core_sse2_doubleround(); \ + salsa8_core_sse2_doubleround(); \ + salsa8_core_sse2_doubleround(); \ -.macro salsa8_core_sse2 - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround - salsa8_core_sse2_doubleround -.endm .p2align 5 scrypt_core_sse2: @@ -692,8 +685,8 @@ scrypt_core_sse2: subl $128, %esp andl $-16, %esp - scrypt_shuffle %edi, 0, %esp, 0 - scrypt_shuffle %edi, 64, %esp, 64 + scrypt_shuffle(%edi, 0, %esp, 0) + scrypt_shuffle(%edi, 64, %esp, 64) movdqa 96(%esp), %xmm6 movdqa 112(%esp), %xmm7 @@ -722,7 +715,7 @@ scrypt_core_sse2_loop1: movdqa %xmm6, 96(%edx) movdqa %xmm7, 112(%edx) - salsa8_core_sse2 + salsa8_core_sse2() paddd 0(%edx), %xmm0 paddd 16(%edx), %xmm1 paddd 32(%edx), %xmm2 @@ -740,7 +733,7 @@ scrypt_core_sse2_loop1: movdqa %xmm1, 80(%esp) movdqa %xmm2, %xmm6 movdqa %xmm3, %xmm7 - salsa8_core_sse2 + salsa8_core_sse2() paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd %xmm2, %xmm6 @@ -779,7 +772,7 @@ scrypt_core_sse2_loop2: pxor %xmm7, %xmm3 movdqa %xmm2, 32(%esp) movdqa %xmm3, 48(%esp) - salsa8_core_sse2 + salsa8_core_sse2() paddd 0(%esp), %xmm0 paddd 16(%esp), %xmm1 paddd 32(%esp), %xmm2 @@ -801,7 +794,7 @@ scrypt_core_sse2_loop2: movdqa %xmm1, 80(%esp) movdqa %xmm2, %xmm6 movdqa %xmm3, %xmm7 - salsa8_core_sse2 + salsa8_core_sse2() paddd 64(%esp), %xmm0 paddd 80(%esp), %xmm1 paddd %xmm2, 
%xmm6 @@ -817,8 +810,8 @@ scrypt_core_sse2_loop2: movdqa %xmm6, 96(%esp) movdqa %xmm7, 112(%esp) - scrypt_shuffle %esp, 0, %edi, 0 - scrypt_shuffle %esp, 64, %edi, 64 + scrypt_shuffle(%esp, 0, %edi, 0) + scrypt_shuffle(%esp, 64, %edi, 64) movl %ebp, %esp popl %esi diff --git a/sha2-arm.S b/sha2-arm.S index bd7fdc5cb..0c824b366 100644 --- a/sha2-arm.S +++ b/sha2-arm.S @@ -11,98 +11,96 @@ #if defined(USE_ASM) && defined(__arm__) && defined(__APCS_32__) -.macro sha256_k - .align 2 - .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5 - .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5 - .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3 - .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174 - .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc - .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da - .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7 - .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967 - .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13 - .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85 - .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3 - .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070 - .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5 - .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3 - .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208 - .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -.endm +#define sha256_k() \ + .align 2; \ + .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5; \ + .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5; \ + .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3; \ + .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174; \ + .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc; \ + .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da; \ + .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7; \ + .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967; \ + .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13; \ + .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85; \ + .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3; \ + .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070; \ + .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5; \ + .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3; \ + .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208; \ + .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2; \ -.macro sha256_extend_doubleround_core i, rw, ra, rb, ry, rz - mov r12, \ry, ror #17 - add r11, r11, \ra - eor r12, r12, \ry, ror #19 - mov \ra, lr, ror #7 - eor r12, r12, \ry, lsr #10 - eor \ra, \ra, lr, ror #18 - add r12, r12, r11 - ldr r11, [\rw, #(\i+2)*4] - eor \ra, \ra, lr, lsr #3 - add \ra, \ra, r12 - mov r12, \rz, ror #17 - str \ra, [\rw, #(\i+16)*4] - add lr, lr, \rb - eor r12, r12, \rz, ror #19 - mov \rb, r11, ror #7 - eor r12, r12, \rz, lsr #10 - eor \rb, \rb, r11, ror #18 - add lr, lr, r12 - eor \rb, \rb, r11, lsr #3 - add \rb, \rb, lr -.endm +#define sha256_extend_doubleround_core(i, rw, ra, rb, ry, rz) \ + mov r12, ry, ror #17; \ + add r11, r11, ra; \ + eor r12, r12, ry, ror #19; \ + mov ra, lr, ror #7; \ + eor r12, r12, ry, lsr #10; \ + eor ra, ra, lr, ror #18; \ + add r12, r12, r11; \ + ldr r11, [rw, #(i+2)*4]; \ + eor ra, ra, lr, lsr #3; \ + add ra, ra, r12; \ + mov r12, rz, ror #17; \ + str ra, [rw, #(i+16)*4]; \ + add lr, lr, rb; \ + eor r12, r12, rz, ror #19; \ + mov rb, r11, ror #7; \ + eor r12, r12, rz, lsr #10; \ + eor rb, rb, r11, ror #18; \ + add lr, lr, r12; \ + eor rb, rb, r11, lsr #3; \ + add rb, rb, lr; \ + + +#define sha256_extend_doubleround_head(i, rw, ra, rb, ry, rz) \ 
+ ldr lr, [rw, #(i+1)*4]; \ + sha256_extend_doubleround_core(i, rw, ra, rb, ry, rz); \ + ldr lr, [rw, #(i+3)*4]; \ -.macro sha256_extend_doubleround_head i, rw, ra, rb, ry, rz - ldr lr, [\rw, #(\i+1)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm -.macro sha256_extend_doubleround_body i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - ldr lr, [\rw, #(\i+3)*4] -.endm +#define sha256_extend_doubleround_body(i, rw, ra, rb, ry, rz) \ + str rz, [rw, #(i+15)*4]; \ + sha256_extend_doubleround_core(i, rw, ra, rb, ry, rz); \ + ldr lr, [rw, #(i+3)*4]; \ -.macro sha256_extend_doubleround_foot i, rw, ra, rb, ry, rz - str \rz, [\rw, #(\i+15)*4] - sha256_extend_doubleround_core \i, \rw, \ra, \rb, \ry, \rz - str \rb, [\rw, #(\i+17)*4] -.endm -.macro sha256_main_round i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - eor r3, \ra, \rb - add \rh, \rh, lr, ror #6 +#define sha256_extend_doubleround_foot(i, rw, ra, rb, ry, rz) \ + str rz, [rw, #(i+15)*4]; \ + sha256_extend_doubleround_core(i, rw, ra, rb, ry, rz); \ + str rb, [rw, #(i+17)*4]; \ - and r3, r3, \rc - eor r12, \ra, \ra, ror #11 - and lr, \ra, \rb - eor r12, r12, \ra, ror #20 - eor lr, lr, r3 - add r3, \rh, lr - add \rh, \rh, \rd - add \rd, r3, r12, ror #2 -.endm -.macro sha256_main_quadround i, ka, rw - sha256_main_round \i+0, \ka, \rw, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round \i+1, \ka, \rw, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round \i+2, \ka, \rw, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round \i+3, \ka, \rw, r5, r6, r7, r4, r9, r10, r11, r8 -.endm +#define sha256_main_round(i, ka, rw, ra, rb, rc, rd, re, rf, rg, rh) \ + ldr r12, [rw, #(i)*4]; \ + and r3, rf, re; \ + bic lr, rg, re; \ + orr lr, lr, r3; \ + ldr r3, ka + (i)*4; \ + add rh, rh, lr; \ + eor lr, re, re, ror #5; \ + add rh, rh, r12; \ + eor lr, lr, re, ror #19; \ + add rh, rh, r3; \ + eor r3, ra, rb; \ + add rh, rh, lr, ror #6; \ + and r3, r3, rc; \ + eor r12, ra, ra, ror #11; \ + and lr, ra, rb; \ + eor r12, r12, ra, ror #20; \ + eor lr, lr, r3; \ + add r3, rh, lr; \ + add rh, rh, rd; \ + add rd, r3, r12, ror #2; \ + + +#define sha256_main_quadround(i, ka, rw) \ + sha256_main_round(i+0, ka, rw, r4, r5, r6, r7, r8, r9, r10, r11); \ + sha256_main_round(i+1, ka, rw, r7, r4, r5, r6, r11, r8, r9, r10); \ + sha256_main_round(i+2, ka, rw, r6, r7, r4, r5, r10, r11, r8, r9); \ + sha256_main_round(i+3, ka, rw, r5, r6, r7, r4, r9, r10, r11, r8); \ + .text @@ -127,86 +125,86 @@ _sha256_transform: stmia r3, {r4-r11} b sha256_transform_extend -.macro bswap rd, rn - eor r12, \rn, \rn, ror #16 - bic r12, r12, #0x00ff0000 - mov \rd, \rn, ror #8 - eor \rd, \rd, r12, lsr #8 -.endm +#define bswap(rd, rn) \ + eor r12, rn, rn, ror #16; \ + bic r12, r12, #0x00ff0000; \ + mov rd, rn, ror #8; \ + eor rd, rd, r12, lsr #8; \ + sha256_transform_swap: ldmia r1!, {r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - bswap r10, r10 - bswap r11, r11 + bswap(r4, r4) + bswap(r5, r5) + bswap(r6, r6) + bswap(r7, r7) + bswap(r8, r8) + bswap(r9, r9) + bswap(r10, r10) + bswap(r11, r11) stmia sp, {r4-r11} add r3, sp, #8*4 ldmia r1, {r4-r11} - bswap r4, r4 - bswap r5, r5 - bswap r6, r6 - bswap r7, r7 - bswap r8, r8 - bswap r9, r9 - 
bswap r10, r10 - bswap r11, r11 + bswap(r4, r4) + bswap(r5, r5) + bswap(r6, r6) + bswap(r7, r7) + bswap(r8, r8) + bswap(r9, r9) + bswap(r10, r10) + bswap(r11, r11) stmia r3, {r4-r11} sha256_transform_extend: add r12, sp, #9*4 ldr r11, [sp, #0*4] ldmia r12, {r4-r10} - sha256_extend_doubleround_head 0, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 2, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 4, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 6, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 8, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 10, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 12, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 14, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 16, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, sp, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, sp, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, sp, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, sp, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, sp, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, sp, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, sp, r4, r5, r9, r10 - sha256_extend_doubleround_body 44, sp, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, sp, r8, r9, r6, r7 + sha256_extend_doubleround_head(0, sp, r4, r5, r9, r10) + sha256_extend_doubleround_body(2, sp, r6, r7, r4, r5) + sha256_extend_doubleround_body(4, sp, r8, r9, r6, r7) + sha256_extend_doubleround_body(6, sp, r10, r4, r8, r9) + sha256_extend_doubleround_body(8, sp, r5, r6, r10, r4) + sha256_extend_doubleround_body(10, sp, r7, r8, r5, r6) + sha256_extend_doubleround_body(12, sp, r9, r10, r7, r8) + sha256_extend_doubleround_body(14, sp, r4, r5, r9, r10) + sha256_extend_doubleround_body(16, sp, r6, r7, r4, r5) + sha256_extend_doubleround_body(18, sp, r8, r9, r6, r7) + sha256_extend_doubleround_body(20, sp, r10, r4, r8, r9) + sha256_extend_doubleround_body(22, sp, r5, r6, r10, r4) + sha256_extend_doubleround_body(24, sp, r7, r8, r5, r6) + sha256_extend_doubleround_body(26, sp, r9, r10, r7, r8) + sha256_extend_doubleround_body(28, sp, r4, r5, r9, r10) + sha256_extend_doubleround_body(30, sp, r6, r7, r4, r5) + sha256_extend_doubleround_body(32, sp, r8, r9, r6, r7) + sha256_extend_doubleround_body(34, sp, r10, r4, r8, r9) + sha256_extend_doubleround_body(36, sp, r5, r6, r10, r4) + sha256_extend_doubleround_body(38, sp, r7, r8, r5, r6) + sha256_extend_doubleround_body(40, sp, r9, r10, r7, r8) + sha256_extend_doubleround_body(42, sp, r4, r5, r9, r10) + sha256_extend_doubleround_body(44, sp, r6, r7, r4, r5) + sha256_extend_doubleround_foot(46, sp, r8, r9, r6, r7) ldmia r0, {r4-r11} - sha256_main_quadround 0, sha256_transform_k, sp - sha256_main_quadround 4, sha256_transform_k, sp - sha256_main_quadround 8, sha256_transform_k, sp - sha256_main_quadround 12, sha256_transform_k, sp - sha256_main_quadround 16, sha256_transform_k, sp - sha256_main_quadround 20, sha256_transform_k, sp - sha256_main_quadround 24, sha256_transform_k, sp - sha256_main_quadround 28, sha256_transform_k, sp + sha256_main_quadround(0, sha256_transform_k, sp) + sha256_main_quadround(4, sha256_transform_k, sp) + sha256_main_quadround(8, 
sha256_transform_k, sp) + sha256_main_quadround(12, sha256_transform_k, sp) + sha256_main_quadround(16, sha256_transform_k, sp) + sha256_main_quadround(20, sha256_transform_k, sp) + sha256_main_quadround(24, sha256_transform_k, sp) + sha256_main_quadround(28, sha256_transform_k, sp) b sha256_transform_k_over sha256_transform_k: - sha256_k + sha256_k() sha256_transform_k_over: - sha256_main_quadround 32, sha256_transform_k, sp - sha256_main_quadround 36, sha256_transform_k, sp - sha256_main_quadround 40, sha256_transform_k, sp - sha256_main_quadround 44, sha256_transform_k, sp - sha256_main_quadround 48, sha256_transform_k, sp - sha256_main_quadround 52, sha256_transform_k, sp - sha256_main_quadround 56, sha256_transform_k, sp - sha256_main_quadround 60, sha256_transform_k, sp + sha256_main_quadround(32, sha256_transform_k, sp) + sha256_main_quadround(36, sha256_transform_k, sp) + sha256_main_quadround(40, sha256_transform_k, sp) + sha256_main_quadround(44, sha256_transform_k, sp) + sha256_main_quadround(48, sha256_transform_k, sp) + sha256_main_quadround(52, sha256_transform_k, sp) + sha256_main_quadround(56, sha256_transform_k, sp) + sha256_main_quadround(60, sha256_transform_k, sp) ldmia r0, {r1, r2, r3, r12} add r4, r4, r1 @@ -343,23 +341,23 @@ _sha256d_ms: ldr lr, [r1, #17*4] sha256d_ms_extend_loop2: - sha256_extend_doubleround_body 16, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 18, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 20, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 22, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 24, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 26, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 28, r1, r4, r5, r9, r10 - sha256_extend_doubleround_body 30, r1, r6, r7, r4, r5 - sha256_extend_doubleround_body 32, r1, r8, r9, r6, r7 - sha256_extend_doubleround_body 34, r1, r10, r4, r8, r9 - sha256_extend_doubleround_body 36, r1, r5, r6, r10, r4 - sha256_extend_doubleround_body 38, r1, r7, r8, r5, r6 - sha256_extend_doubleround_body 40, r1, r9, r10, r7, r8 - sha256_extend_doubleround_body 42, r1, r4, r5, r9, r10 + sha256_extend_doubleround_body(16, r1, r6, r7, r4, r5) + sha256_extend_doubleround_body(18, r1, r8, r9, r6, r7) + sha256_extend_doubleround_body(20, r1, r10, r4, r8, r9) + sha256_extend_doubleround_body(22, r1, r5, r6, r10, r4) + sha256_extend_doubleround_body(24, r1, r7, r8, r5, r6) + sha256_extend_doubleround_body(26, r1, r9, r10, r7, r8) + sha256_extend_doubleround_body(28, r1, r4, r5, r9, r10) + sha256_extend_doubleround_body(30, r1, r6, r7, r4, r5) + sha256_extend_doubleround_body(32, r1, r8, r9, r6, r7) + sha256_extend_doubleround_body(34, r1, r10, r4, r8, r9) + sha256_extend_doubleround_body(36, r1, r5, r6, r10, r4) + sha256_extend_doubleround_body(38, r1, r7, r8, r5, r6) + sha256_extend_doubleround_body(40, r1, r9, r10, r7, r8) + sha256_extend_doubleround_body(42, r1, r4, r5, r9, r10) bne sha256d_ms_extend_coda2 - sha256_extend_doubleround_body 44, r1, r6, r7, r4, r5 - sha256_extend_doubleround_foot 46, r1, r8, r9, r6, r7 + sha256_extend_doubleround_body(44, r1, r6, r7, r4, r5) + sha256_extend_doubleround_foot(46, r1, r8, r9, r6, r7) ldr r4, [r3, #0*4] ldr r9, [r3, #1*4] @@ -372,34 +370,34 @@ sha256d_ms_extend_loop2: b sha256d_ms_main_loop1 sha256d_ms_main_loop2: - sha256_main_round 0, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 - sha256_main_round 1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 + sha256_main_round(0, 
sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11) + sha256_main_round(1, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10) + sha256_main_round(2, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9) sha256d_ms_main_loop1: - sha256_main_round 3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 4, sha256d_ms_k, r1 - sha256_main_quadround 8, sha256d_ms_k, r1 - sha256_main_quadround 12, sha256d_ms_k, r1 - sha256_main_quadround 16, sha256d_ms_k, r1 - sha256_main_quadround 20, sha256d_ms_k, r1 - sha256_main_quadround 24, sha256d_ms_k, r1 - sha256_main_quadround 28, sha256d_ms_k, r1 + sha256_main_round(3, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8) + sha256_main_quadround(4, sha256d_ms_k, r1) + sha256_main_quadround(8, sha256d_ms_k, r1) + sha256_main_quadround(12, sha256d_ms_k, r1) + sha256_main_quadround(16, sha256d_ms_k, r1) + sha256_main_quadround(20, sha256d_ms_k, r1) + sha256_main_quadround(24, sha256d_ms_k, r1) + sha256_main_quadround(28, sha256d_ms_k, r1) b sha256d_ms_k_over sha256d_ms_k: - sha256_k + sha256_k() sha256d_ms_k_over: - sha256_main_quadround 32, sha256d_ms_k, r1 - sha256_main_quadround 36, sha256d_ms_k, r1 - sha256_main_quadround 40, sha256d_ms_k, r1 - sha256_main_quadround 44, sha256d_ms_k, r1 - sha256_main_quadround 48, sha256d_ms_k, r1 - sha256_main_quadround 52, sha256d_ms_k, r1 - sha256_main_round 56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11 + sha256_main_quadround(32, sha256d_ms_k, r1) + sha256_main_quadround(36, sha256d_ms_k, r1) + sha256_main_quadround(40, sha256d_ms_k, r1) + sha256_main_quadround(44, sha256d_ms_k, r1) + sha256_main_quadround(48, sha256d_ms_k, r1) + sha256_main_quadround(52, sha256d_ms_k, r1) + sha256_main_round(56, sha256d_ms_k, r1, r4, r5, r6, r7, r8, r9, r10, r11) bne sha256d_ms_finish - sha256_main_round 57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10 - sha256_main_round 58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9 - sha256_main_round 59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8 - sha256_main_quadround 60, sha256d_ms_k, r1 + sha256_main_round(57, sha256d_ms_k, r1, r7, r4, r5, r6, r11, r8, r9, r10) + sha256_main_round(58, sha256d_ms_k, r1, r6, r7, r4, r5, r10, r11, r8, r9) + sha256_main_round(59, sha256d_ms_k, r1, r5, r6, r7, r4, r9, r10, r11, r8) + sha256_main_quadround(60, sha256d_ms_k, r1) ldmia r2!, {r3, r12, lr} add r4, r4, r3 @@ -594,27 +592,27 @@ sha256d_ms_h: .long 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a .long 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 -.macro sha256_main_round_red i, ka, rw, rd, re, rf, rg, rh - ldr r12, [\rw, #(\i)*4] - and r3, \rf, \re - bic lr, \rg, \re - add \rh, \rh, \rd - orr lr, lr, r3 - ldr r3, \ka + (\i)*4 - add \rh, \rh, lr - eor lr, \re, \re, ror #5 - add \rh, \rh, r12 - eor lr, lr, \re, ror #19 - add \rh, \rh, r3 - add \rh, \rh, lr, ror #6 -.endm +#define sha256_main_round_red(i, ka, rw, rd, re, rf, rg, rh) \ + ldr r12, [rw, #(i)*4]; \ + and r3, rf, re; \ + bic lr, rg, re; \ + add rh, rh, rd; \ + orr lr, lr, r3; \ + ldr r3, ka + (i)*4; \ + add rh, rh, lr; \ + eor lr, re, re, ror #5; \ + add rh, rh, r12; \ + eor lr, lr, re, ror #19; \ + add rh, rh, r3; \ + add rh, rh, lr, ror #6; \ + sha256d_ms_finish: - sha256_main_round_red 57, sha256d_ms_k, r1, r6, r11, r8, r9, r10 - sha256_main_round_red 58, sha256d_ms_k, r1, r5, r10, r11, r8, r9 - sha256_main_round_red 59, sha256d_ms_k, r1, r4, r9, r10, r11, r8 + sha256_main_round_red(57, sha256d_ms_k, r1, r6, r11, r8, r9, r10) + sha256_main_round_red(58, sha256d_ms_k, r1, r5, r10, r11, r8, 
r9) + sha256_main_round_red(59, sha256d_ms_k, r1, r4, r9, r10, r11, r8) ldr r5, [r2, #7*4] - sha256_main_round_red 60, sha256d_ms_k, r1, r7, r8, r9, r10, r11 + sha256_main_round_red(60, sha256d_ms_k, r1, r7, r8, r9, r10, r11) add r11, r11, r5 str r11, [r0, #7*4] @@ -655,193 +653,191 @@ sha256_4h: .long 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab, 0x1f83d9ab .long 0x5be0cd19, 0x5be0cd19, 0x5be0cd19, 0x5be0cd19 -.macro sha256_4k - .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98 - .long 0x71374491, 0x71374491, 0x71374491, 0x71374491 - .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf - .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5 - .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b - .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1 - .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4 - .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5 - .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98 - .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01 - .long 0x243185be, 0x243185be, 0x243185be, 0x243185be - .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3 - .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74 - .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe - .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7 - .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174 - .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1 - .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786 - .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6 - .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc - .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f - .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa - .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc - .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da - .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152 - .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d - .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8 - .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7 - .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3 - .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147 - .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351 - .long 0x14292967, 0x14292967, 0x14292967, 0x14292967 - .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85 - .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138 - .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc - .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13 - .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354 - .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb - .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e - .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85 - .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1 - .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b - .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70 - .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3 - .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819 - .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624 - .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585 - .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070 - .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116 - .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08 - .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c - .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5 - .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3 - .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a - .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f - .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3 - .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 
0x748f82ee - .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f - .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814 - .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208 - .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa - .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb - .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7 - .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2 -.endm +#define sha256_4k() \ + .long 0x428a2f98, 0x428a2f98, 0x428a2f98, 0x428a2f98; \ + .long 0x71374491, 0x71374491, 0x71374491, 0x71374491; \ + .long 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf, 0xb5c0fbcf; \ + .long 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5, 0xe9b5dba5; \ + .long 0x3956c25b, 0x3956c25b, 0x3956c25b, 0x3956c25b; \ + .long 0x59f111f1, 0x59f111f1, 0x59f111f1, 0x59f111f1; \ + .long 0x923f82a4, 0x923f82a4, 0x923f82a4, 0x923f82a4; \ + .long 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5, 0xab1c5ed5; \ + .long 0xd807aa98, 0xd807aa98, 0xd807aa98, 0xd807aa98; \ + .long 0x12835b01, 0x12835b01, 0x12835b01, 0x12835b01; \ + .long 0x243185be, 0x243185be, 0x243185be, 0x243185be; \ + .long 0x550c7dc3, 0x550c7dc3, 0x550c7dc3, 0x550c7dc3; \ + .long 0x72be5d74, 0x72be5d74, 0x72be5d74, 0x72be5d74; \ + .long 0x80deb1fe, 0x80deb1fe, 0x80deb1fe, 0x80deb1fe; \ + .long 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7, 0x9bdc06a7; \ + .long 0xc19bf174, 0xc19bf174, 0xc19bf174, 0xc19bf174; \ + .long 0xe49b69c1, 0xe49b69c1, 0xe49b69c1, 0xe49b69c1; \ + .long 0xefbe4786, 0xefbe4786, 0xefbe4786, 0xefbe4786; \ + .long 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6, 0x0fc19dc6; \ + .long 0x240ca1cc, 0x240ca1cc, 0x240ca1cc, 0x240ca1cc; \ + .long 0x2de92c6f, 0x2de92c6f, 0x2de92c6f, 0x2de92c6f; \ + .long 0x4a7484aa, 0x4a7484aa, 0x4a7484aa, 0x4a7484aa; \ + .long 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc, 0x5cb0a9dc; \ + .long 0x76f988da, 0x76f988da, 0x76f988da, 0x76f988da; \ + .long 0x983e5152, 0x983e5152, 0x983e5152, 0x983e5152; \ + .long 0xa831c66d, 0xa831c66d, 0xa831c66d, 0xa831c66d; \ + .long 0xb00327c8, 0xb00327c8, 0xb00327c8, 0xb00327c8; \ + .long 0xbf597fc7, 0xbf597fc7, 0xbf597fc7, 0xbf597fc7; \ + .long 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3, 0xc6e00bf3; \ + .long 0xd5a79147, 0xd5a79147, 0xd5a79147, 0xd5a79147; \ + .long 0x06ca6351, 0x06ca6351, 0x06ca6351, 0x06ca6351; \ + .long 0x14292967, 0x14292967, 0x14292967, 0x14292967; \ + .long 0x27b70a85, 0x27b70a85, 0x27b70a85, 0x27b70a85; \ + .long 0x2e1b2138, 0x2e1b2138, 0x2e1b2138, 0x2e1b2138; \ + .long 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc, 0x4d2c6dfc; \ + .long 0x53380d13, 0x53380d13, 0x53380d13, 0x53380d13; \ + .long 0x650a7354, 0x650a7354, 0x650a7354, 0x650a7354; \ + .long 0x766a0abb, 0x766a0abb, 0x766a0abb, 0x766a0abb; \ + .long 0x81c2c92e, 0x81c2c92e, 0x81c2c92e, 0x81c2c92e; \ + .long 0x92722c85, 0x92722c85, 0x92722c85, 0x92722c85; \ + .long 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1, 0xa2bfe8a1; \ + .long 0xa81a664b, 0xa81a664b, 0xa81a664b, 0xa81a664b; \ + .long 0xc24b8b70, 0xc24b8b70, 0xc24b8b70, 0xc24b8b70; \ + .long 0xc76c51a3, 0xc76c51a3, 0xc76c51a3, 0xc76c51a3; \ + .long 0xd192e819, 0xd192e819, 0xd192e819, 0xd192e819; \ + .long 0xd6990624, 0xd6990624, 0xd6990624, 0xd6990624; \ + .long 0xf40e3585, 0xf40e3585, 0xf40e3585, 0xf40e3585; \ + .long 0x106aa070, 0x106aa070, 0x106aa070, 0x106aa070; \ + .long 0x19a4c116, 0x19a4c116, 0x19a4c116, 0x19a4c116; \ + .long 0x1e376c08, 0x1e376c08, 0x1e376c08, 0x1e376c08; \ + .long 0x2748774c, 0x2748774c, 0x2748774c, 0x2748774c; \ + .long 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5, 0x34b0bcb5; \ + .long 0x391c0cb3, 0x391c0cb3, 0x391c0cb3, 0x391c0cb3; \ + .long 0x4ed8aa4a, 0x4ed8aa4a, 0x4ed8aa4a, 
0x4ed8aa4a; \ + .long 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f, 0x5b9cca4f; \ + .long 0x682e6ff3, 0x682e6ff3, 0x682e6ff3, 0x682e6ff3; \ + .long 0x748f82ee, 0x748f82ee, 0x748f82ee, 0x748f82ee; \ + .long 0x78a5636f, 0x78a5636f, 0x78a5636f, 0x78a5636f; \ + .long 0x84c87814, 0x84c87814, 0x84c87814, 0x84c87814; \ + .long 0x8cc70208, 0x8cc70208, 0x8cc70208, 0x8cc70208; \ + .long 0x90befffa, 0x90befffa, 0x90befffa, 0x90befffa; \ + .long 0xa4506ceb, 0xa4506ceb, 0xa4506ceb, 0xa4506ceb; \ + .long 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7, 0xbef9a3f7; \ + .long 0xc67178f2, 0xc67178f2, 0xc67178f2, 0xc67178f2; \ -.macro sha256_4way_extend_doubleround_core i, rr, rw, ra, rb, ry, rz - vadd.u32 q5, q5, \ra - veor.u32 q4, q4, q0 - vshr.u32 q0, \ry, #19 - vshl.u32 q1, \ry, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \ra, q6, #7 - vshl.u32 q0, q6, #32-7 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshr.u32 q1, \ry, #10 - vshr.u32 q0, q6, #18 - veor.u32 q4, q4, q1 - veor.u32 \ra, \ra, q0 - vshl.u32 q1, q6, #32-18 - vshr.u32 q0, q6, #3 - veor.u32 \ra, \ra, q1 - vadd.u32 q4, q4, q5 - veor.u32 \ra, \ra, q0 - vld1.u32 {q5}, [\rr]! - vadd.u32 \ra, \ra, q4 - vshr.u32 q4, \rz, #17 - vshl.u32 q0, \rz, #32-17 - vadd.u32 q6, q6, \rb - vst1.u32 {\ra}, [\rw]! - veor.u32 q4, q4, q0 - vshr.u32 q0, \rz, #19 - vshl.u32 q1, \rz, #32-19 - veor.u32 q4, q4, q0 - vshr.u32 \rb, q5, #7 - veor.u32 q4, q4, q1 - vshl.u32 q0, q5, #32-7 - vshr.u32 q1, \rz, #10 - veor.u32 \rb, \rb, q0 - vshr.u32 q0, q5, #18 - veor.u32 q4, q4, q1 - veor.u32 \rb, \rb, q0 - vshl.u32 q1, q5, #32-18 - vshr.u32 q0, q5, #3 - veor.u32 \rb, \rb, q1 - vadd.u32 q1, q6, q4 - veor.u32 \rb, \rb, q0 -.endm +#define sha256_4way_extend_doubleround_core(i, rr, rw, ra, rb, ry, rz) \ + vadd.u32 q5, q5, ra; \ + veor.u32 q4, q4, q0; \ + vshr.u32 q0, ry, #19; \ + vshl.u32 q1, ry, #32-19; \ + veor.u32 q4, q4, q0; \ + vshr.u32 ra, q6, #7; \ + vshl.u32 q0, q6, #32-7; \ + veor.u32 q4, q4, q1; \ + veor.u32 ra, ra, q0; \ + vshr.u32 q1, ry, #10; \ + vshr.u32 q0, q6, #18; \ + veor.u32 q4, q4, q1; \ + veor.u32 ra, ra, q0; \ + vshl.u32 q1, q6, #32-18; \ + vshr.u32 q0, q6, #3; \ + veor.u32 ra, ra, q1; \ + vadd.u32 q4, q4, q5; \ + veor.u32 ra, ra, q0; \ + vld1.u32 {q5}, [rr]!; \ + vadd.u32 ra, ra, q4; \ + vshr.u32 q4, rz, #17; \ + vshl.u32 q0, rz, #32-17; \ + vadd.u32 q6, q6, rb; \ + vst1.u32 {ra}, [rw]!; \ + veor.u32 q4, q4, q0; \ + vshr.u32 q0, rz, #19; \ + vshl.u32 q1, rz, #32-19; \ + veor.u32 q4, q4, q0; \ + vshr.u32 rb, q5, #7; \ + veor.u32 q4, q4, q1; \ + vshl.u32 q0, q5, #32-7; \ + vshr.u32 q1, rz, #10; \ + veor.u32 rb, rb, q0; \ + vshr.u32 q0, q5, #18; \ + veor.u32 q4, q4, q1; \ + veor.u32 rb, rb, q0; \ + vshl.u32 q1, q5, #32-18; \ + vshr.u32 q0, q5, #3; \ + veor.u32 rb, rb, q1; \ + vadd.u32 q1, q6, q4; \ + veor.u32 rb, rb, q0; \ + + +#define sha256_4way_extend_doubleround_head(i, rr, rw, ra, rb, ry, rz) \ + vld1.u32 {q6}, [rr]!; \ + vshr.u32 q4, ry, #17; \ + vshl.u32 q0, ry, #32-17; \ + sha256_4way_extend_doubleround_core(i, rr, rw, ra, rb, ry, rz); \ + vld1.u32 {q6}, [rr]!; \ + vadd.u32 rb, rb, q1; \ -.macro sha256_4way_extend_doubleround_head i, rr, rw, ra, rb, ry, rz - vld1.u32 {q6}, [\rr]! - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm -.macro sha256_4way_extend_doubleround_body i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! 
- sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vld1.u32 {q6}, [\rr]! - vadd.u32 \rb, \rb, q1 -.endm +#define sha256_4way_extend_doubleround_body(i, rr, rw, ra, rb, ry, rz) \ + vshr.u32 q4, ry, #17; \ + vshl.u32 q0, ry, #32-17; \ + vst1.u32 {rz}, [rw]!; \ + sha256_4way_extend_doubleround_core(i, rr, rw, ra, rb, ry, rz); \ + vld1.u32 {q6}, [rr]!; \ + vadd.u32 rb, rb, q1; \ -.macro sha256_4way_extend_doubleround_foot i, rr, rw, ra, rb, ry, rz - vshr.u32 q4, \ry, #17 - vshl.u32 q0, \ry, #32-17 - vst1.u32 {\rz}, [\rw]! - sha256_4way_extend_doubleround_core \i, \rr, \rw, \ra, \rb, \ry, \rz - vadd.u32 \rb, \rb, q1 - vst1.u32 {\rb}, [\rw]! -.endm -.macro sha256_4way_main_round i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! - vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 - vorr.u32 q10, q10, q9 - vld1.u32 {q9}, [\rk]! - vadd.u32 \rh, \rh, q10 - vshl.u32 q12, \re, #32-5 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 - veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 - veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 - veor.u32 q10, q10, q12 - vadd.u32 \rh, \rh, q9 - veor.u32 q9, \ra, \rb - vshr.u32 q11, q10, #6 - vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 +#define sha256_4way_extend_doubleround_foot(i, rr, rw, ra, rb, ry, rz) \ + vshr.u32 q4, ry, #17; \ + vshl.u32 q0, ry, #32-17; \ + vst1.u32 {rz}, [rw]!; \ + sha256_4way_extend_doubleround_core(i, rr, rw, ra, rb, ry, rz); \ + vadd.u32 rb, rb, q1; \ + vst1.u32 {rb}, [rw]!; \ - vshr.u32 q11, \ra, #11 - vshl.u32 q12, \ra, #32-11 - veor.u32 q8, \ra, q11 - vand.u32 q10, \ra, \rb - veor.u32 q8, q8, q12 - vshr.u32 q11, \ra, #20 - vshl.u32 q12, \ra, #32-20 - veor.u32 q8, q8, q11 - vand.u32 q9, q9, \rc - veor.u32 q8, q8, q12 - vadd.u32 \rh, \rh, q13 - veor.u32 q10, q10, q9 - vshr.u32 q11, q8, #2 - vshl.u32 q12, q8, #32-2 - vadd.u32 q9, \rh, q10 - vadd.u32 q12, q12, q11 - vadd.u32 \rh, \rh, \rd - vadd.u32 \rd, q9, q12 -.endm -.macro sha256_4way_main_quadround i, rk, rw - sha256_4way_main_round \i+0, \rk, \rw, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round \i+1, \rk, \rw, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round \i+2, \rk, \rw, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round \i+3, \rk, \rw, q1, q2, q3, q0, q5, q6, q7, q4 -.endm +#define sha256_4way_main_round(i, rk, rw, ra, rb, rc, rd, re, rf, rg, rh) \ + vld1.u32 {q8}, [rw]!; \ + vand.u32 q9, rf, re; \ + vbic.u32 q10, rg, re; \ + vshr.u32 q11, re, #5; \ + vorr.u32 q10, q10, q9; \ + vld1.u32 {q9}, [rk]!; \ + vadd.u32 rh, rh, q10; \ + vshl.u32 q12, re, #32-5; \ + veor.u32 q10, re, q11; \ + vshr.u32 q11, re, #19; \ + veor.u32 q10, q10, q12; \ + vshl.u32 q12, re, #32-19; \ + veor.u32 q10, q10, q11; \ + vadd.u32 rh, rh, q8; \ + veor.u32 q10, q10, q12; \ + vadd.u32 rh, rh, q9; \ + veor.u32 q9, ra, rb; \ + vshr.u32 q11, q10, #6; \ + vshl.u32 q13, q10, #32-6; \ + vadd.u32 rh, rh, q11; \ + vshr.u32 q11, ra, #11; \ + vshl.u32 q12, ra, #32-11; \ + veor.u32 q8, ra, q11; \ + vand.u32 q10, ra, rb; \ + veor.u32 q8, q8, q12; \ + vshr.u32 q11, ra, #20; \ + vshl.u32 q12, ra, #32-20; \ + veor.u32 q8, q8, q11; \ + vand.u32 q9, q9, rc; \ + veor.u32 q8, q8, q12; \ + vadd.u32 rh, rh, q13; \ + veor.u32 q10, q10, q9; \ + vshr.u32 q11, q8, #2; \ + vshl.u32 q12, q8, #32-2; \ + vadd.u32 q9, rh, q10; \ + vadd.u32 q12, q12, q11; \ + vadd.u32 rh, rh, rd; \ + vadd.u32 rd, q9, q12; \ + + +#define sha256_4way_main_quadround(i, rk, rw) \ + sha256_4way_main_round(i+0, rk, rw, q0, q1, q2, q3, q4, q5, q6, q7); \ + 
sha256_4way_main_round(i+1, rk, rw, q3, q0, q1, q2, q7, q4, q5, q6); \ + sha256_4way_main_round(i+2, rk, rw, q2, q3, q0, q1, q6, q7, q4, q5); \ + sha256_4way_main_round(i+3, rk, rw, q1, q2, q3, q0, q5, q6, q7, q4); \ + .text @@ -896,54 +892,54 @@ sha256_transform_4way_extend: add r1, sp, #1*16 add r2, sp, #16*16 vmov.u32 q5, q0 - sha256_4way_extend_doubleround_head 0, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 2, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 4, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 6, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 8, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 10, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 12, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 14, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 16, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r1, r2, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r1, r2, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r1, r2, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r1, r2, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r1, r2, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r1, r2, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r1, r2, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r1, r2, q13, q14, q11, q12 + sha256_4way_extend_doubleround_head(0, r1, r2, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(2, r1, r2, q11, q12, q9, q10) + sha256_4way_extend_doubleround_body(4, r1, r2, q13, q14, q11, q12) + sha256_4way_extend_doubleround_body(6, r1, r2, q15, q9, q13, q14) + sha256_4way_extend_doubleround_body(8, r1, r2, q10, q11, q15, q9) + sha256_4way_extend_doubleround_body(10, r1, r2, q12, q13, q10, q11) + sha256_4way_extend_doubleround_body(12, r1, r2, q14, q15, q12, q13) + sha256_4way_extend_doubleround_body(14, r1, r2, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(16, r1, r2, q11, q12, q9, q10) + sha256_4way_extend_doubleround_body(18, r1, r2, q13, q14, q11, q12) + sha256_4way_extend_doubleround_body(20, r1, r2, q15, q9, q13, q14) + sha256_4way_extend_doubleround_body(22, r1, r2, q10, q11, q15, q9) + sha256_4way_extend_doubleround_body(24, r1, r2, q12, q13, q10, q11) + sha256_4way_extend_doubleround_body(26, r1, r2, q14, q15, q12, q13) + sha256_4way_extend_doubleround_body(28, r1, r2, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(30, r1, r2, q11, q12, q9, q10) + sha256_4way_extend_doubleround_body(32, r1, r2, q13, q14, q11, q12) + sha256_4way_extend_doubleround_body(34, r1, r2, q15, q9, q13, q14) + sha256_4way_extend_doubleround_body(36, r1, r2, q10, q11, q15, q9) + sha256_4way_extend_doubleround_body(38, r1, r2, q12, q13, q10, q11) + sha256_4way_extend_doubleround_body(40, r1, r2, q14, q15, q12, q13) + sha256_4way_extend_doubleround_body(42, r1, r2, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(44, r1, r2, q11, q12, q9, 
q10) + sha256_4way_extend_doubleround_foot(46, r1, r2, q13, q14, q11, q12) vldmia r0, {q0-q7} adr r4, sha256_transform_4way_4k b sha256_transform_4way_4k_over .align 4 sha256_transform_4way_4k: - sha256_4k + sha256_4k() sha256_transform_4way_4k_over: - sha256_4way_main_quadround 0, r4, sp - sha256_4way_main_quadround 4, r4, sp - sha256_4way_main_quadround 8, r4, sp - sha256_4way_main_quadround 12, r4, sp - sha256_4way_main_quadround 16, r4, sp - sha256_4way_main_quadround 20, r4, sp - sha256_4way_main_quadround 24, r4, sp - sha256_4way_main_quadround 28, r4, sp - sha256_4way_main_quadround 32, r4, sp - sha256_4way_main_quadround 36, r4, sp - sha256_4way_main_quadround 40, r4, sp - sha256_4way_main_quadround 44, r4, sp - sha256_4way_main_quadround 48, r4, sp - sha256_4way_main_quadround 52, r4, sp - sha256_4way_main_quadround 56, r4, sp - sha256_4way_main_quadround 60, r4, sp + sha256_4way_main_quadround(0, r4, sp) + sha256_4way_main_quadround(4, r4, sp) + sha256_4way_main_quadround(8, r4, sp) + sha256_4way_main_quadround(12, r4, sp) + sha256_4way_main_quadround(16, r4, sp) + sha256_4way_main_quadround(20, r4, sp) + sha256_4way_main_quadround(24, r4, sp) + sha256_4way_main_quadround(28, r4, sp) + sha256_4way_main_quadround(32, r4, sp) + sha256_4way_main_quadround(36, r4, sp) + sha256_4way_main_quadround(40, r4, sp) + sha256_4way_main_quadround(44, r4, sp) + sha256_4way_main_quadround(48, r4, sp) + sha256_4way_main_quadround(52, r4, sp) + sha256_4way_main_quadround(56, r4, sp) + sha256_4way_main_quadround(60, r4, sp) vldmia r0, {q8-q15} vadd.u32 q0, q0, q8 @@ -1151,22 +1147,22 @@ _sha256d_ms_4way: sub sp, sp, #8*16 sha256d_ms_4way_extend_loop2: - sha256_4way_extend_doubleround_body 16, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 18, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 20, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 22, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 24, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 26, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 28, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 30, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_body 32, r4, r1, q13, q14, q11, q12 - sha256_4way_extend_doubleround_body 34, r4, r1, q15, q9, q13, q14 - sha256_4way_extend_doubleround_body 36, r4, r1, q10, q11, q15, q9 - sha256_4way_extend_doubleround_body 38, r4, r1, q12, q13, q10, q11 - sha256_4way_extend_doubleround_body 40, r4, r1, q14, q15, q12, q13 - sha256_4way_extend_doubleround_body 42, r4, r1, q9, q10, q14, q15 - sha256_4way_extend_doubleround_body 44, r4, r1, q11, q12, q9, q10 - sha256_4way_extend_doubleround_foot 46, r4, r1, q13, q14, q11, q12 + sha256_4way_extend_doubleround_body(16, r4, r1, q11, q12, q9, q10) + sha256_4way_extend_doubleround_body(18, r4, r1, q13, q14, q11, q12) + sha256_4way_extend_doubleround_body(20, r4, r1, q15, q9, q13, q14) + sha256_4way_extend_doubleround_body(22, r4, r1, q10, q11, q15, q9) + sha256_4way_extend_doubleround_body(24, r4, r1, q12, q13, q10, q11) + sha256_4way_extend_doubleround_body(26, r4, r1, q14, q15, q12, q13) + sha256_4way_extend_doubleround_body(28, r4, r1, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(30, r4, r1, q11, q12, q9, q10) + sha256_4way_extend_doubleround_body(32, r4, r1, q13, q14, q11, q12) + sha256_4way_extend_doubleround_body(34, r4, r1, q15, q9, q13, q14) + sha256_4way_extend_doubleround_body(36, r4, r1, q10, q11, q15, q9) + 
sha256_4way_extend_doubleround_body(38, r4, r1, q12, q13, q10, q11) + sha256_4way_extend_doubleround_body(40, r4, r1, q14, q15, q12, q13) + sha256_4way_extend_doubleround_body(42, r4, r1, q9, q10, q14, q15) + sha256_4way_extend_doubleround_body(44, r4, r1, q11, q12, q9, q10) + sha256_4way_extend_doubleround_foot(46, r4, r1, q13, q14, q11, q12) bne sha256d_ms_4way_extend_coda2 vldmia r3!, {q4-q7} @@ -1178,33 +1174,33 @@ sha256d_ms_4way_extend_loop2: .align 4 sha256d_ms_4way_4k: - sha256_4k + sha256_4k() sha256d_ms_4way_main_loop2: - sha256_4way_main_round 0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 - sha256_4way_main_round 1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 + sha256_4way_main_round(0, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7) + sha256_4way_main_round(1, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6) + sha256_4way_main_round(2, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5) sha256d_ms_4way_main_loop1: - sha256_4way_main_round 3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 4, r3, r1 - sha256_4way_main_quadround 8, r3, r1 - sha256_4way_main_quadround 12, r3, r1 - sha256_4way_main_quadround 16, r3, r1 - sha256_4way_main_quadround 20, r3, r1 - sha256_4way_main_quadround 24, r3, r1 - sha256_4way_main_quadround 28, r3, r1 - sha256_4way_main_quadround 32, r3, r1 - sha256_4way_main_quadround 36, r3, r1 - sha256_4way_main_quadround 40, r3, r1 - sha256_4way_main_quadround 44, r3, r1 - sha256_4way_main_quadround 48, r3, r1 - sha256_4way_main_quadround 52, r3, r1 - sha256_4way_main_round 56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7 + sha256_4way_main_round(3, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4) + sha256_4way_main_quadround(4, r3, r1) + sha256_4way_main_quadround(8, r3, r1) + sha256_4way_main_quadround(12, r3, r1) + sha256_4way_main_quadround(16, r3, r1) + sha256_4way_main_quadround(20, r3, r1) + sha256_4way_main_quadround(24, r3, r1) + sha256_4way_main_quadround(28, r3, r1) + sha256_4way_main_quadround(32, r3, r1) + sha256_4way_main_quadround(36, r3, r1) + sha256_4way_main_quadround(40, r3, r1) + sha256_4way_main_quadround(44, r3, r1) + sha256_4way_main_quadround(48, r3, r1) + sha256_4way_main_quadround(52, r3, r1) + sha256_4way_main_round(56, r3, r1, q0, q1, q2, q3, q4, q5, q6, q7) bne sha256d_ms_4way_finish - sha256_4way_main_round 57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6 - sha256_4way_main_round 58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5 - sha256_4way_main_round 59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4 - sha256_4way_main_quadround 60, r3, r1 + sha256_4way_main_round(57, r3, r1, q3, q0, q1, q2, q7, q4, q5, q6) + sha256_4way_main_round(58, r3, r1, q2, q3, q0, q1, q6, q7, q4, q5) + sha256_4way_main_round(59, r3, r1, q1, q2, q3, q0, q5, q6, q7, q4) + sha256_4way_main_quadround(60, r3, r1) vldmia r2, {q8-q15} vadd.u32 q0, q0, q8 @@ -1526,35 +1522,35 @@ sha256d_ms_4way_extend_coda2: sub r3, r3, #64*16 b sha256d_ms_4way_main_loop2 -.macro sha256_4way_main_round_red i, rk, rw, rd, re, rf, rg, rh - vld1.u32 {q8}, [\rw]! - vand.u32 q9, \rf, \re - vbic.u32 q10, \rg, \re - vshr.u32 q11, \re, #5 - vorr.u32 q10, q10, q9 - vshl.u32 q12, \re, #32-5 - vadd.u32 \rh, \rh, q10 - veor.u32 q10, \re, q11 - vshr.u32 q11, \re, #19 - veor.u32 q10, q10, q12 - vshl.u32 q12, \re, #32-19 - veor.u32 q10, q10, q11 - vadd.u32 \rh, \rh, q8 - veor.u32 q10, q10, q12 - vld1.u32 {q9}, [\rk]! 
- vadd.u32 \rh, \rh, \rd - vshr.u32 q11, q10, #6 - vadd.u32 \rh, \rh, q9 - vshl.u32 q13, q10, #32-6 - vadd.u32 \rh, \rh, q11 - vadd.u32 \rh, \rh, q13 -.endm +#define sha256_4way_main_round_red(i, rk, rw, rd, re, rf, rg, rh) \ + vld1.u32 {q8}, [rw]!; \ + vand.u32 q9, rf, re; \ + vbic.u32 q10, rg, re; \ + vshr.u32 q11, re, #5; \ + vorr.u32 q10, q10, q9; \ + vshl.u32 q12, re, #32-5; \ + vadd.u32 rh, rh, q10; \ + veor.u32 q10, re, q11; \ + vshr.u32 q11, re, #19; \ + veor.u32 q10, q10, q12; \ + vshl.u32 q12, re, #32-19; \ + veor.u32 q10, q10, q11; \ + vadd.u32 rh, rh, q8; \ + veor.u32 q10, q10, q12; \ + vld1.u32 {q9}, [rk]!; \ + vadd.u32 rh, rh, rd; \ + vshr.u32 q11, q10, #6; \ + vadd.u32 rh, rh, q9; \ + vshl.u32 q13, q10, #32-6; \ + vadd.u32 rh, rh, q11; \ + vadd.u32 rh, rh, q13; \ + sha256d_ms_4way_finish: - sha256_4way_main_round_red 57, r3, r1, q2, q7, q4, q5, q6 - sha256_4way_main_round_red 58, r3, r1, q1, q6, q7, q4, q5 - sha256_4way_main_round_red 59, r3, r1, q0, q5, q6, q7, q4 - sha256_4way_main_round_red 60, r3, r1, q3, q4, q5, q6, q7 + sha256_4way_main_round_red(57, r3, r1, q2, q7, q4, q5, q6) + sha256_4way_main_round_red(58, r3, r1, q1, q6, q7, q4, q5) + sha256_4way_main_round_red(59, r3, r1, q0, q5, q6, q7, q4) + sha256_4way_main_round_red(60, r3, r1, q3, q4, q5, q6, q7) vadd.u32 q7, q7, q15 add r0, r0, #7*16 diff --git a/sha2-x64.S b/sha2-x64.S index 0326d9d4a..3eeb00707 100644 --- a/sha2-x64.S +++ b/sha2-x64.S @@ -269,543 +269,499 @@ _sha256_init_8way: #endif /* USE_AVX2 */ -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-7)*16(%rax), %xmm0 - - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%rax), %xmm0 - movdqa (\i-14)*16(%rax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - - paddd (\i-16)*16(%rax), %xmm0 - paddd (\i-15)*16(%rax), %xmm4 - - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 - - paddd (\i-7)*16(%rax), %xmm0 - paddd (\i-6)*16(%rax), %xmm4 - - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%rax) - movdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%rax), %xmm6 - - movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 - pandn %xmm2, %xmm1 - paddd 32(%rsp), %xmm6 - - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), 
%xmm2 - movdqa %xmm2, 16(%rsp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) +#define sha256_sse2_extend_round(i) \ + movdqa (i-15)*16(%rax), %xmm0; \ + movdqa %xmm0, %xmm2; \ + psrld $3, %xmm0; \ + movdqa %xmm0, %xmm1; \ + pslld $14, %xmm2; \ + psrld $4, %xmm1; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + psrld $11, %xmm1; \ + pslld $11, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + paddd (i-16)*16(%rax), %xmm0; \ + paddd (i-7)*16(%rax), %xmm0; \ + movdqa %xmm3, %xmm2; \ + psrld $10, %xmm3; \ + pslld $13, %xmm2; \ + movdqa %xmm3, %xmm1; \ + psrld $7, %xmm1; \ + pxor %xmm1, %xmm3; \ + pxor %xmm2, %xmm3; \ + psrld $2, %xmm1; \ + pslld $2, %xmm2; \ + pxor %xmm1, %xmm3; \ + pxor %xmm2, %xmm3; \ + paddd %xmm0, %xmm3; \ + movdqa %xmm3, i*16(%rax); \ + + +#define sha256_sse2_extend_doubleround(i) \ + movdqa (i-15)*16(%rax), %xmm0; \ + movdqa (i-14)*16(%rax), %xmm4; \ + movdqa %xmm0, %xmm2; \ + movdqa %xmm4, %xmm6; \ + psrld $3, %xmm0; \ + psrld $3, %xmm4; \ + movdqa %xmm0, %xmm1; \ + movdqa %xmm4, %xmm5; \ + pslld $14, %xmm2; \ + pslld $14, %xmm6; \ + psrld $4, %xmm1; \ + psrld $4, %xmm5; \ + pxor %xmm1, %xmm0; \ + pxor %xmm5, %xmm4; \ + psrld $11, %xmm1; \ + psrld $11, %xmm5; \ + pxor %xmm2, %xmm0; \ + pxor %xmm6, %xmm4; \ + pslld $11, %xmm2; \ + pslld $11, %xmm6; \ + pxor %xmm1, %xmm0; \ + pxor %xmm5, %xmm4; \ + pxor %xmm2, %xmm0; \ + pxor %xmm6, %xmm4; \ + paddd (i-16)*16(%rax), %xmm0; \ + paddd (i-15)*16(%rax), %xmm4; \ + movdqa %xmm3, %xmm2; \ + movdqa %xmm7, %xmm6; \ + psrld $10, %xmm3; \ + psrld $10, %xmm7; \ + movdqa %xmm3, %xmm1; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm2; \ + pslld $13, %xmm6; \ + psrld $7, %xmm1; \ + psrld $7, %xmm5; \ + paddd (i-7)*16(%rax), %xmm0; \ + paddd (i-6)*16(%rax), %xmm4; \ + pxor %xmm1, %xmm3; \ + pxor %xmm5, %xmm7; \ + psrld $2, %xmm1; \ + psrld $2, %xmm5; \ + pxor %xmm2, %xmm3; \ + pxor %xmm6, %xmm7; \ + pslld $2, %xmm2; \ + pslld $2, %xmm6; \ + pxor %xmm1, %xmm3; \ + pxor %xmm5, %xmm7; \ + pxor %xmm2, %xmm3; \ + pxor %xmm6, %xmm7; \ + paddd %xmm0, %xmm3; \ + paddd %xmm4, %xmm7; \ + movdqa %xmm3, i*16(%rax); \ + movdqa %xmm7, (i+1)*16(%rax); \ + + +#define sha256_sse2_main_round(i) \ + movdqa 16*(i)(%rax), %xmm6; \ + movdqa %xmm0, %xmm1; \ + movdqa 16(%rsp), %xmm2; \ + pandn %xmm2, %xmm1; \ + paddd 32(%rsp), %xmm6; \ + movdqa %xmm2, 32(%rsp); \ + movdqa 0(%rsp), %xmm2; \ + movdqa %xmm2, 16(%rsp); \ + pand %xmm0, %xmm2; \ + pxor %xmm2, %xmm1; \ + movdqa %xmm0, 0(%rsp); \ + paddd %xmm1, %xmm6; \ + movdqa %xmm0, %xmm1; \ + psrld $6, %xmm0; \ + paddd 16*(i)(%rcx), %xmm6; \ + movdqa %xmm0, %xmm2; \ + pslld $7, %xmm1; \ + psrld $5, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $14, %xmm1; \ + psrld $14, %xmm2; \ + pxor %xmm1, %xmm0; \ + pslld $5, %xmm1; \ + pxor %xmm2, %xmm0; \ + pxor %xmm1, %xmm0; \ + movdqa %xmm5, %xmm1; \ + paddd %xmm0, %xmm6; \ + movdqa %xmm3, %xmm0; \ + movdqa %xmm4, %xmm3; \ + movdqa %xmm4, %xmm2; \ + paddd %xmm6, %xmm0; \ + pand %xmm5, %xmm2; \ + pand %xmm7, %xmm1; \ + pand %xmm7, %xmm4; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm5, %xmm4; \ + movdqa %xmm7, %xmm5; \ + pxor %xmm2, %xmm1; \ + paddd %xmm1, %xmm6; \ + movdqa %xmm7, %xmm2; \ + psrld $2, %xmm7; \ + movdqa %xmm7, %xmm1; \ + pslld $10, %xmm2; \ + psrld $11, %xmm1; \ + pxor %xmm2, %xmm7; \ + pslld $9, %xmm2; \ + pxor %xmm1, %xmm7; \ + psrld $9, %xmm1; \ + pxor %xmm2, %xmm7; \ + pslld $11, %xmm2; \ + pxor %xmm1, %xmm7; \ + pxor %xmm2, %xmm7; \ + paddd %xmm6, %xmm7; \ + + +#define sha256_sse2_main_quadround(i) \ + 
sha256_sse2_main_round(i+0); \ + sha256_sse2_main_round(i+1); \ + sha256_sse2_main_round(i+2); \ + sha256_sse2_main_round(i+3); \ - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - paddd 16*(\i)(%rcx), %xmm6 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pslld $5, %xmm1 - pxor %xmm2, %xmm0 - pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 - pand %xmm5, %xmm2 - pand %xmm7, %xmm1 - pand %xmm7, %xmm4 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pslld $9, %xmm2 - pxor %xmm1, %xmm7 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pslld $11, %xmm2 - pxor %xmm1, %xmm7 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 -.endm - -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm #if defined(USE_AVX) -.macro sha256_avx_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 - vpslld $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpsrld $4, %xmm0, %xmm1 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpsrld $11, %xmm1, %xmm1 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm1, %xmm0, %xmm0 - vpxor %xmm2, %xmm0, %xmm0 - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - - vpslld $13, %xmm3, %xmm2 - vpsrld $10, %xmm3, %xmm3 - vpsrld $7, %xmm3, %xmm1 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm2, %xmm3, %xmm3 - vpsrld $2, %xmm1, %xmm1 - vpslld $2, %xmm2, %xmm2 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_avx_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 - vpslld $14, %xmm0, %xmm2 - vpslld $14, %xmm4, %xmm6 - vpsrld $3, %xmm0, %xmm8 - vpsrld $3, %xmm4, %xmm4 - vpsrld $7, %xmm0, %xmm1 - vpsrld $4, %xmm4, %xmm5 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpsrld $11, %xmm1, %xmm1 - vpsrld $11, %xmm5, %xmm5 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - vpslld $11, %xmm2, %xmm2 - vpslld $11, %xmm6, %xmm6 - vpxor %xmm1, %xmm8, %xmm8 - vpxor %xmm5, %xmm4, %xmm4 - vpxor %xmm2, %xmm8, %xmm8 - vpxor %xmm6, %xmm4, %xmm4 - - vpaddd %xmm0, %xmm4, %xmm4 - vpaddd (\i-16)*16(%rax), %xmm8, %xmm0 +#define sha256_avx_extend_round(i) \ + vmovdqa (i-15)*16(%rax), %xmm0; \ + vpslld $14, %xmm0, %xmm2; \ + vpsrld $3, %xmm0, %xmm0; \ + vpsrld $4, %xmm0, %xmm1; \ + vpxor %xmm1, %xmm0, %xmm0; \ + vpxor %xmm2, %xmm0, %xmm0; \ + vpsrld $11, %xmm1, %xmm1; \ + vpslld $11, %xmm2, %xmm2; \ + vpxor %xmm1, %xmm0, %xmm0; \ + vpxor %xmm2, %xmm0, %xmm0; \ + vpaddd (i-16)*16(%rax), %xmm0, %xmm0; \ + vpaddd (i-7)*16(%rax), %xmm0, %xmm0; \ + vpslld $13, %xmm3, %xmm2; \ + vpsrld $10, %xmm3, %xmm3; \ + vpsrld $7, %xmm3, %xmm1; \ + vpxor %xmm1, %xmm3, %xmm3; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpsrld $2, %xmm1, %xmm1; \ + vpslld $2, %xmm2, %xmm2; \ + vpxor %xmm1, %xmm3, %xmm3; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpaddd %xmm0, %xmm3, %xmm3; \ + vmovdqa %xmm3, i*16(%rax); \ + + +#define sha256_avx_extend_doubleround(i) \ + vmovdqa (i-15)*16(%rax), %xmm0; \ + vmovdqa (i-14)*16(%rax), %xmm4; \ + vpslld $14, %xmm0, %xmm2; \ + vpslld $14, %xmm4, %xmm6; \ + vpsrld $3, %xmm0, %xmm8; \ + 
vpsrld $3, %xmm4, %xmm4; \ + vpsrld $7, %xmm0, %xmm1; \ + vpsrld $4, %xmm4, %xmm5; \ + vpxor %xmm1, %xmm8, %xmm8; \ + vpxor %xmm5, %xmm4, %xmm4; \ + vpsrld $11, %xmm1, %xmm1; \ + vpsrld $11, %xmm5, %xmm5; \ + vpxor %xmm2, %xmm8, %xmm8; \ + vpxor %xmm6, %xmm4, %xmm4; \ + vpslld $11, %xmm2, %xmm2; \ + vpslld $11, %xmm6, %xmm6; \ + vpxor %xmm1, %xmm8, %xmm8; \ + vpxor %xmm5, %xmm4, %xmm4; \ + vpxor %xmm2, %xmm8, %xmm8; \ + vpxor %xmm6, %xmm4, %xmm4; \ + vpaddd %xmm0, %xmm4, %xmm4; \ + vpaddd (i-16)*16(%rax), %xmm8, %xmm0; \ + vpslld $13, %xmm3, %xmm2; \ + vpslld $13, %xmm7, %xmm6; \ + vpsrld $10, %xmm3, %xmm3; \ + vpsrld $10, %xmm7, %xmm7; \ + vpaddd (i-7)*16(%rax), %xmm0, %xmm0; \ + vpaddd (i-6)*16(%rax), %xmm4, %xmm4; \ + vpsrld $7, %xmm3, %xmm1; \ + vpsrld $7, %xmm7, %xmm5; \ + vpxor %xmm1, %xmm3, %xmm3; \ + vpxor %xmm5, %xmm7, %xmm7; \ + vpsrld $2, %xmm1, %xmm1; \ + vpsrld $2, %xmm5, %xmm5; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpxor %xmm6, %xmm7, %xmm7; \ + vpslld $2, %xmm2, %xmm2; \ + vpslld $2, %xmm6, %xmm6; \ + vpxor %xmm1, %xmm3, %xmm3; \ + vpxor %xmm5, %xmm7, %xmm7; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpxor %xmm6, %xmm7, %xmm7; \ + vpaddd %xmm0, %xmm3, %xmm3; \ + vpaddd %xmm4, %xmm7, %xmm7; \ + vmovdqa %xmm3, i*16(%rax); \ + vmovdqa %xmm7, (i+1)*16(%rax); \ + + +#define sha256_avx_main_round(i, r0, r1, r2, r3, r4, r5, r6, r7) \ + vpaddd 16*(i)(%rax), r0, %xmm6; \ + vpaddd 16*(i)(%rcx), %xmm6, %xmm6; \ + vpandn r1, r3, %xmm1; \ + vpand r3, r2, %xmm2; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vpslld $7, r3, %xmm1; \ + vpsrld $6, r3, r0; \ + vpsrld $5, r0, %xmm2; \ + vpxor %xmm1, r0, r0; \ + vpxor %xmm2, r0, r0; \ + vpslld $14, %xmm1, %xmm1; \ + vpsrld $14, %xmm2, %xmm2; \ + vpxor %xmm1, r0, r0; \ + vpxor %xmm2, r0, r0; \ + vpslld $5, %xmm1, %xmm1; \ + vpxor %xmm1, r0, r0; \ + vpaddd r0, %xmm6, %xmm6; \ + vpaddd %xmm6, r4, r0; \ + vpand r6, r5, %xmm2; \ + vpand r7, r5, r4; \ + vpand r7, r6, %xmm1; \ + vpxor r4, %xmm1, %xmm1; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vpslld $10, r7, %xmm2; \ + vpsrld $2, r7, r4; \ + vpsrld $11, r4, %xmm1; \ + vpxor %xmm2, r4, r4; \ + vpxor %xmm1, r4, r4; \ + vpslld $9, %xmm2, %xmm2; \ + vpsrld $9, %xmm1, %xmm1; \ + vpxor %xmm2, r4, r4; \ + vpxor %xmm1, r4, r4; \ + vpslld $11, %xmm2, %xmm2; \ + vpxor %xmm2, r4, r4; \ + vpaddd %xmm6, r4, r4; \ + + +#define sha256_avx_main_quadround(i) \ + sha256_avx_main_round(i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7); \ + sha256_avx_main_round(i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3); \ + sha256_avx_main_round(i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4); \ + sha256_avx_main_round(i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5); \ - vpslld $13, %xmm3, %xmm2 - vpslld $13, %xmm7, %xmm6 - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 - - vpsrld $7, %xmm3, %xmm1 - vpsrld $7, %xmm7, %xmm5 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpsrld $2, %xmm1, %xmm1 - vpsrld $2, %xmm5, %xmm5 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - vpslld $2, %xmm2, %xmm2 - vpslld $2, %xmm6, %xmm6 - vpxor %xmm1, %xmm3, %xmm3 - vpxor %xmm5, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_avx_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), 
\r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 - - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vpslld $10, \r7, %xmm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 - vpslld $9, %xmm2, %xmm2 - vpsrld $9, %xmm1, %xmm1 - vpxor %xmm2, \r4, \r4 - vpxor %xmm1, \r4, \r4 - vpslld $11, %xmm2, %xmm2 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm - -.macro sha256_avx_main_quadround i - sha256_avx_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm #endif /* USE_AVX */ #if defined(USE_AVX2) -.macro sha256_avx2_extend_round i - vmovdqa (\i-15)*32(%rax), %ymm0 - vpslld $14, %ymm0, %ymm2 - vpsrld $3, %ymm0, %ymm0 - vpsrld $4, %ymm0, %ymm1 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpsrld $11, %ymm1, %ymm1 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm2, %ymm0, %ymm0 - vpaddd (\i-16)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpsrld $10, %ymm3, %ymm3 - vpsrld $7, %ymm3, %ymm1 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpsrld $2, %ymm1, %ymm1 - vpslld $2, %ymm2, %ymm2 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm2, %ymm3, %ymm3 - vpaddd %ymm0, %ymm3, %ymm3 - vmovdqa %ymm3, \i*32(%rax) -.endm - -.macro sha256_avx2_extend_doubleround i - vmovdqa (\i-15)*32(%rax), %ymm0 - vmovdqa (\i-14)*32(%rax), %ymm4 - vpslld $14, %ymm0, %ymm2 - vpslld $14, %ymm4, %ymm6 - vpsrld $3, %ymm0, %ymm8 - vpsrld $3, %ymm4, %ymm4 - vpsrld $7, %ymm0, %ymm1 - vpsrld $4, %ymm4, %ymm5 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpsrld $11, %ymm1, %ymm1 - vpsrld $11, %ymm5, %ymm5 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - vpslld $11, %ymm2, %ymm2 - vpslld $11, %ymm6, %ymm6 - vpxor %ymm1, %ymm8, %ymm8 - vpxor %ymm5, %ymm4, %ymm4 - vpxor %ymm2, %ymm8, %ymm8 - vpxor %ymm6, %ymm4, %ymm4 - - vpaddd %ymm0, %ymm4, %ymm4 - vpaddd (\i-16)*32(%rax), %ymm8, %ymm0 - - vpslld $13, %ymm3, %ymm2 - vpslld $13, %ymm7, %ymm6 - vpsrld $10, %ymm3, %ymm3 - vpsrld $10, %ymm7, %ymm7 - - vpaddd (\i-7)*32(%rax), %ymm0, %ymm0 - vpaddd (\i-6)*32(%rax), %ymm4, %ymm4 - - vpsrld $7, %ymm3, %ymm1 - vpsrld $7, %ymm7, %ymm5 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpsrld $2, %ymm1, %ymm1 - vpsrld $2, %ymm5, %ymm5 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - vpslld $2, %ymm2, %ymm2 - vpslld $2, %ymm6, %ymm6 - vpxor %ymm1, %ymm3, %ymm3 - vpxor %ymm5, %ymm7, %ymm7 - vpxor %ymm2, %ymm3, %ymm3 - vpxor %ymm6, %ymm7, %ymm7 - - vpaddd %ymm0, %ymm3, %ymm3 - vpaddd %ymm4, %ymm7, %ymm7 - vmovdqa %ymm3, \i*32(%rax) - vmovdqa %ymm7, (\i+1)*32(%rax) -.endm - -.macro sha256_avx2_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 32*(\i)(%rax), \r0, %ymm6 - 
vpaddd 32*(\i)(%rcx), %ymm6, %ymm6 - - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - - vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $14, %ymm1, %ymm1 - vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 - - vpand \r6, \r5, %ymm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %ymm1 - vpxor \r4, %ymm1, %ymm1 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - - vpslld $10, \r7, %ymm2 - vpsrld $2, \r7, \r4 - vpsrld $11, \r4, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $9, %ymm2, %ymm2 - vpsrld $9, %ymm1, %ymm1 - vpxor %ymm2, \r4, \r4 - vpxor %ymm1, \r4, \r4 - vpslld $11, %ymm2, %ymm2 - vpxor %ymm2, \r4, \r4 - vpaddd %ymm6, \r4, \r4 -.endm +#define sha256_avx2_extend_round(i) \ + vmovdqa (i-15)*32(%rax), %ymm0; \ + vpslld $14, %ymm0, %ymm2; \ + vpsrld $3, %ymm0, %ymm0; \ + vpsrld $4, %ymm0, %ymm1; \ + vpxor %ymm1, %ymm0, %ymm0; \ + vpxor %ymm2, %ymm0, %ymm0; \ + vpsrld $11, %ymm1, %ymm1; \ + vpslld $11, %ymm2, %ymm2; \ + vpxor %ymm1, %ymm0, %ymm0; \ + vpxor %ymm2, %ymm0, %ymm0; \ + vpaddd (i-16)*32(%rax), %ymm0, %ymm0; \ + vpaddd (i-7)*32(%rax), %ymm0, %ymm0; \ + vpslld $13, %ymm3, %ymm2; \ + vpsrld $10, %ymm3, %ymm3; \ + vpsrld $7, %ymm3, %ymm1; \ + vpxor %ymm1, %ymm3, %ymm3; \ + vpxor %ymm2, %ymm3, %ymm3; \ + vpsrld $2, %ymm1, %ymm1; \ + vpslld $2, %ymm2, %ymm2; \ + vpxor %ymm1, %ymm3, %ymm3; \ + vpxor %ymm2, %ymm3, %ymm3; \ + vpaddd %ymm0, %ymm3, %ymm3; \ + vmovdqa %ymm3, i*32(%rax); \ + + +#define sha256_avx2_extend_doubleround(i) \ + vmovdqa (i-15)*32(%rax), %ymm0; \ + vmovdqa (i-14)*32(%rax), %ymm4; \ + vpslld $14, %ymm0, %ymm2; \ + vpslld $14, %ymm4, %ymm6; \ + vpsrld $3, %ymm0, %ymm8; \ + vpsrld $3, %ymm4, %ymm4; \ + vpsrld $7, %ymm0, %ymm1; \ + vpsrld $4, %ymm4, %ymm5; \ + vpxor %ymm1, %ymm8, %ymm8; \ + vpxor %ymm5, %ymm4, %ymm4; \ + vpsrld $11, %ymm1, %ymm1; \ + vpsrld $11, %ymm5, %ymm5; \ + vpxor %ymm2, %ymm8, %ymm8; \ + vpxor %ymm6, %ymm4, %ymm4; \ + vpslld $11, %ymm2, %ymm2; \ + vpslld $11, %ymm6, %ymm6; \ + vpxor %ymm1, %ymm8, %ymm8; \ + vpxor %ymm5, %ymm4, %ymm4; \ + vpxor %ymm2, %ymm8, %ymm8; \ + vpxor %ymm6, %ymm4, %ymm4; \ + vpaddd %ymm0, %ymm4, %ymm4; \ + vpaddd (i-16)*32(%rax), %ymm8, %ymm0; \ + vpslld $13, %ymm3, %ymm2; \ + vpslld $13, %ymm7, %ymm6; \ + vpsrld $10, %ymm3, %ymm3; \ + vpsrld $10, %ymm7, %ymm7; \ + vpaddd (i-7)*32(%rax), %ymm0, %ymm0; \ + vpaddd (i-6)*32(%rax), %ymm4, %ymm4; \ + vpsrld $7, %ymm3, %ymm1; \ + vpsrld $7, %ymm7, %ymm5; \ + vpxor %ymm1, %ymm3, %ymm3; \ + vpxor %ymm5, %ymm7, %ymm7; \ + vpsrld $2, %ymm1, %ymm1; \ + vpsrld $2, %ymm5, %ymm5; \ + vpxor %ymm2, %ymm3, %ymm3; \ + vpxor %ymm6, %ymm7, %ymm7; \ + vpslld $2, %ymm2, %ymm2; \ + vpslld $2, %ymm6, %ymm6; \ + vpxor %ymm1, %ymm3, %ymm3; \ + vpxor %ymm5, %ymm7, %ymm7; \ + vpxor %ymm2, %ymm3, %ymm3; \ + vpxor %ymm6, %ymm7, %ymm7; \ + vpaddd %ymm0, %ymm3, %ymm3; \ + vpaddd %ymm4, %ymm7, %ymm7; \ + vmovdqa %ymm3, i*32(%rax); \ + vmovdqa %ymm7, (i+1)*32(%rax); \ + + +#define sha256_avx2_main_round(i, r0, r1, r2, r3, r4, r5, r6, r7) \ + vpaddd 32*(i)(%rax), r0, %ymm6; \ + vpaddd 32*(i)(%rcx), %ymm6, %ymm6; \ + vpandn r1, r3, %ymm1; \ + vpand r3, r2, %ymm2; \ + vpxor %ymm2, %ymm1, %ymm1; \ + vpaddd %ymm1, %ymm6, %ymm6; \ + vpslld $7, r3, %ymm1; \ + vpsrld $6, r3, r0; \ + vpsrld $5, r0, %ymm2; \ + vpxor %ymm1, r0, r0; \ + vpxor 
%ymm2, r0, r0; \ + vpslld $14, %ymm1, %ymm1; \ + vpsrld $14, %ymm2, %ymm2; \ + vpxor %ymm1, r0, r0; \ + vpxor %ymm2, r0, r0; \ + vpslld $5, %ymm1, %ymm1; \ + vpxor %ymm1, r0, r0; \ + vpaddd r0, %ymm6, %ymm6; \ + vpaddd %ymm6, r4, r0; \ + vpand r6, r5, %ymm2; \ + vpand r7, r5, r4; \ + vpand r7, r6, %ymm1; \ + vpxor r4, %ymm1, %ymm1; \ + vpxor %ymm2, %ymm1, %ymm1; \ + vpaddd %ymm1, %ymm6, %ymm6; \ + vpslld $10, r7, %ymm2; \ + vpsrld $2, r7, r4; \ + vpsrld $11, r4, %ymm1; \ + vpxor %ymm2, r4, r4; \ + vpxor %ymm1, r4, r4; \ + vpslld $9, %ymm2, %ymm2; \ + vpsrld $9, %ymm1, %ymm1; \ + vpxor %ymm2, r4, r4; \ + vpxor %ymm1, r4, r4; \ + vpslld $11, %ymm2, %ymm2; \ + vpxor %ymm2, r4, r4; \ + vpaddd %ymm6, r4, r4; \ + + +#define sha256_avx2_main_quadround(i) \ + sha256_avx2_main_round(i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7); \ + sha256_avx2_main_round(i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3); \ + sha256_avx2_main_round(i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4); \ + sha256_avx2_main_round(i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5); \ -.macro sha256_avx2_main_quadround i - sha256_avx2_main_round \i+0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round \i+1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round \i+2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round \i+3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 -.endm #endif /* USE_AVX2 */ #if defined(USE_XOP) -.macro sha256_xop_extend_round i - vmovdqa (\i-15)*16(%rax), %xmm0 - vprotd $25, %xmm0, %xmm1 - vprotd $14, %xmm0, %xmm2 - vpsrld $3, %xmm0, %xmm0 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm0, %xmm0 - - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 +#define sha256_xop_extend_round(i) \ + vmovdqa (i-15)*16(%rax), %xmm0; \ + vprotd $25, %xmm0, %xmm1; \ + vprotd $14, %xmm0, %xmm2; \ + vpsrld $3, %xmm0, %xmm0; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vpxor %xmm2, %xmm0, %xmm0; \ + vpaddd (i-16)*16(%rax), %xmm0, %xmm0; \ + vpaddd (i-7)*16(%rax), %xmm0, %xmm0; \ + vprotd $15, %xmm3, %xmm1; \ + vprotd $13, %xmm3, %xmm2; \ + vpsrld $10, %xmm3, %xmm3; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpaddd %xmm0, %xmm3, %xmm3; \ + vmovdqa %xmm3, i*16(%rax); \ + + +#define sha256_xop_extend_doubleround(i) \ + vmovdqa (i-15)*16(%rax), %xmm0; \ + vmovdqa (i-14)*16(%rax), %xmm4; \ + vprotd $25, %xmm0, %xmm1; \ + vprotd $25, %xmm4, %xmm5; \ + vprotd $14, %xmm0, %xmm2; \ + vprotd $14, %xmm4, %xmm6; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vpxor %xmm5, %xmm6, %xmm6; \ + vpsrld $3, %xmm0, %xmm0; \ + vpsrld $3, %xmm4, %xmm4; \ + vpxor %xmm2, %xmm0, %xmm0; \ + vpxor %xmm6, %xmm4, %xmm4; \ + vpaddd (i-16)*16(%rax), %xmm0, %xmm0; \ + vpaddd (i-15)*16(%rax), %xmm4, %xmm4; \ + vprotd $15, %xmm3, %xmm1; \ + vprotd $15, %xmm7, %xmm5; \ + vprotd $13, %xmm3, %xmm2; \ + vprotd $13, %xmm7, %xmm6; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vpxor %xmm5, %xmm6, %xmm6; \ + vpaddd (i-7)*16(%rax), %xmm0, %xmm0; \ + vpaddd (i-6)*16(%rax), %xmm4, %xmm4; \ + vpsrld $10, %xmm3, %xmm3; \ + vpsrld $10, %xmm7, %xmm7; \ + vpxor %xmm2, %xmm3, %xmm3; \ + vpxor %xmm6, %xmm7, %xmm7; \ + vpaddd %xmm0, %xmm3, %xmm3; \ + vpaddd %xmm4, %xmm7, %xmm7; \ + vmovdqa %xmm3, i*16(%rax); \ + vmovdqa %xmm7, (i+1)*16(%rax); \ + + +#define sha256_xop_main_round(i, r0, r1, r2, r3, r4, r5, r6, r7) \ + vpaddd 16*(i)(%rax), r0, %xmm6; \ + vpaddd 16*(i)(%rcx), %xmm6, %xmm6; \ + vpandn r1, r3, 
%xmm1; \ + vpand r3, r2, %xmm2; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vprotd $26, r3, %xmm1; \ + vprotd $21, r3, %xmm2; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vprotd $7, r3, r0; \ + vpxor %xmm2, r0, r0; \ + vpaddd r0, %xmm6, %xmm6; \ + vpaddd %xmm6, r4, r0; \ + vpand r6, r5, %xmm2; \ + vpand r7, r5, r4; \ + vpand r7, r6, %xmm1; \ + vpxor r4, %xmm1, %xmm1; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vprotd $30, r7, %xmm1; \ + vprotd $19, r7, %xmm2; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vprotd $10, r7, r4; \ + vpxor %xmm2, r4, r4; \ + vpaddd %xmm6, r4, r4; \ + + +#define sha256_xop_main_quadround(i) \ + sha256_xop_main_round(i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7); \ + sha256_xop_main_round(i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3); \ + sha256_xop_main_round(i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4); \ + sha256_xop_main_round(i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5); \ - vprotd $15, %xmm3, %xmm1 - vprotd $13, %xmm3, %xmm2 - vpsrld $10, %xmm3, %xmm3 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm2, %xmm3, %xmm3 - vpaddd %xmm0, %xmm3, %xmm3 - vmovdqa %xmm3, \i*16(%rax) -.endm - -.macro sha256_xop_extend_doubleround i - vmovdqa (\i-15)*16(%rax), %xmm0 - vmovdqa (\i-14)*16(%rax), %xmm4 - vprotd $25, %xmm0, %xmm1 - vprotd $25, %xmm4, %xmm5 - vprotd $14, %xmm0, %xmm2 - vprotd $14, %xmm4, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - vpsrld $3, %xmm0, %xmm0 - vpsrld $3, %xmm4, %xmm4 - vpxor %xmm2, %xmm0, %xmm0 - vpxor %xmm6, %xmm4, %xmm4 - - vpaddd (\i-16)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-15)*16(%rax), %xmm4, %xmm4 - - vprotd $15, %xmm3, %xmm1 - vprotd $15, %xmm7, %xmm5 - vprotd $13, %xmm3, %xmm2 - vprotd $13, %xmm7, %xmm6 - vpxor %xmm1, %xmm2, %xmm2 - vpxor %xmm5, %xmm6, %xmm6 - - vpaddd (\i-7)*16(%rax), %xmm0, %xmm0 - vpaddd (\i-6)*16(%rax), %xmm4, %xmm4 - - vpsrld $10, %xmm3, %xmm3 - vpsrld $10, %xmm7, %xmm7 - vpxor %xmm2, %xmm3, %xmm3 - vpxor %xmm6, %xmm7, %xmm7 - - vpaddd %xmm0, %xmm3, %xmm3 - vpaddd %xmm4, %xmm7, %xmm7 - vmovdqa %xmm3, \i*16(%rax) - vmovdqa %xmm7, (\i+1)*16(%rax) -.endm - -.macro sha256_xop_main_round i, r0, r1, r2, r3, r4, r5, r6, r7 - vpaddd 16*(\i)(%rax), \r0, %xmm6 - vpaddd 16*(\i)(%rcx), %xmm6, %xmm6 - - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 - - vpand \r6, \r5, %xmm2 - vpand \r7, \r5, \r4 - vpand \r7, \r6, %xmm1 - vpxor \r4, %xmm1, %xmm1 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - - vprotd $30, \r7, %xmm1 - vprotd $19, \r7, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $10, \r7, \r4 - vpxor %xmm2, \r4, \r4 - vpaddd %xmm6, \r4, \r4 -.endm - -.macro sha256_xop_main_quadround i - sha256_xop_main_round \i+0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round \i+1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round \i+2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round \i+3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 -.endm #endif /* USE_XOP */ @@ -969,30 +925,30 @@ sha256_transform_4way_core_avx: leaq 256(%rsp), %rax movdqa -2*16(%rax), %xmm3 movdqa -1*16(%rax), %xmm7 - sha256_avx_extend_doubleround 0 - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 - 
sha256_avx_extend_doubleround 6 - sha256_avx_extend_doubleround 8 - sha256_avx_extend_doubleround 10 - sha256_avx_extend_doubleround 12 - sha256_avx_extend_doubleround 14 - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 + sha256_avx_extend_doubleround(0) + sha256_avx_extend_doubleround(2) + sha256_avx_extend_doubleround(4) + sha256_avx_extend_doubleround(6) + sha256_avx_extend_doubleround(8) + sha256_avx_extend_doubleround(10) + sha256_avx_extend_doubleround(12) + sha256_avx_extend_doubleround(14) + sha256_avx_extend_doubleround(16) + sha256_avx_extend_doubleround(18) + sha256_avx_extend_doubleround(20) + sha256_avx_extend_doubleround(22) + sha256_avx_extend_doubleround(24) + sha256_avx_extend_doubleround(26) + sha256_avx_extend_doubleround(28) + sha256_avx_extend_doubleround(30) + sha256_avx_extend_doubleround(32) + sha256_avx_extend_doubleround(34) + sha256_avx_extend_doubleround(36) + sha256_avx_extend_doubleround(38) + sha256_avx_extend_doubleround(40) + sha256_avx_extend_doubleround(42) + sha256_avx_extend_doubleround(44) + sha256_avx_extend_doubleround(46) movdqu 0(%rdi), %xmm7 movdqu 16(%rdi), %xmm5 movdqu 32(%rdi), %xmm4 @@ -1003,22 +959,22 @@ sha256_transform_4way_core_avx: movdqu 112(%rdi), %xmm10 movq %rsp, %rax leaq sha256_4k(%rip), %rcx - sha256_avx_main_quadround 0 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_quadround 56 - sha256_avx_main_quadround 60 + sha256_avx_main_quadround(0) + sha256_avx_main_quadround(4) + sha256_avx_main_quadround(8) + sha256_avx_main_quadround(12) + sha256_avx_main_quadround(16) + sha256_avx_main_quadround(20) + sha256_avx_main_quadround(24) + sha256_avx_main_quadround(28) + sha256_avx_main_quadround(32) + sha256_avx_main_quadround(36) + sha256_avx_main_quadround(40) + sha256_avx_main_quadround(44) + sha256_avx_main_quadround(48) + sha256_avx_main_quadround(52) + sha256_avx_main_quadround(56) + sha256_avx_main_quadround(60) jmp sha256_transform_4way_finish #endif /* USE_AVX */ @@ -1030,30 +986,30 @@ sha256_transform_4way_core_xop: leaq 256(%rsp), %rax movdqa -2*16(%rax), %xmm3 movdqa -1*16(%rax), %xmm7 - sha256_xop_extend_doubleround 0 - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 - sha256_xop_extend_doubleround 6 - sha256_xop_extend_doubleround 8 - sha256_xop_extend_doubleround 10 - sha256_xop_extend_doubleround 12 - sha256_xop_extend_doubleround 14 - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 
- sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 + sha256_xop_extend_doubleround(0) + sha256_xop_extend_doubleround(2) + sha256_xop_extend_doubleround(4) + sha256_xop_extend_doubleround(6) + sha256_xop_extend_doubleround(8) + sha256_xop_extend_doubleround(10) + sha256_xop_extend_doubleround(12) + sha256_xop_extend_doubleround(14) + sha256_xop_extend_doubleround(16) + sha256_xop_extend_doubleround(18) + sha256_xop_extend_doubleround(20) + sha256_xop_extend_doubleround(22) + sha256_xop_extend_doubleround(24) + sha256_xop_extend_doubleround(26) + sha256_xop_extend_doubleround(28) + sha256_xop_extend_doubleround(30) + sha256_xop_extend_doubleround(32) + sha256_xop_extend_doubleround(34) + sha256_xop_extend_doubleround(36) + sha256_xop_extend_doubleround(38) + sha256_xop_extend_doubleround(40) + sha256_xop_extend_doubleround(42) + sha256_xop_extend_doubleround(44) + sha256_xop_extend_doubleround(46) movdqu 0(%rdi), %xmm7 movdqu 16(%rdi), %xmm5 movdqu 32(%rdi), %xmm4 @@ -1064,22 +1020,22 @@ sha256_transform_4way_core_xop: movdqu 112(%rdi), %xmm10 movq %rsp, %rax leaq sha256_4k(%rip), %rcx - sha256_xop_main_quadround 0 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_quadround 56 - sha256_xop_main_quadround 60 + sha256_xop_main_quadround(0) + sha256_xop_main_quadround(4) + sha256_xop_main_quadround(8) + sha256_xop_main_quadround(12) + sha256_xop_main_quadround(16) + sha256_xop_main_quadround(20) + sha256_xop_main_quadround(24) + sha256_xop_main_quadround(28) + sha256_xop_main_quadround(32) + sha256_xop_main_quadround(36) + sha256_xop_main_quadround(40) + sha256_xop_main_quadround(44) + sha256_xop_main_quadround(48) + sha256_xop_main_quadround(52) + sha256_xop_main_quadround(56) + sha256_xop_main_quadround(60) jmp sha256_transform_4way_finish #endif /* USE_XOP */ @@ -1089,24 +1045,24 @@ sha256_transform_4way_core_xop: sha256_transform_4way_core_addr: .quad 0x0 -.macro p2bswap_rsi_rsp i - movdqu \i*16(%rsi), %xmm0 - movdqu (\i+1)*16(%rsi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, \i*16(%rsp) - movdqa %xmm2, (\i+1)*16(%rsp) -.endm +#define p2bswap_rsi_rsp(i) \ + movdqu i*16(%rsi), %xmm0; \ + movdqu (i+1)*16(%rsi), %xmm2; \ + pshuflw $0xb1, %xmm0, %xmm0; \ + pshuflw $0xb1, %xmm2, %xmm2; \ + pshufhw $0xb1, %xmm0, %xmm0; \ + pshufhw $0xb1, %xmm2, %xmm2; \ + movdqa %xmm0, %xmm1; \ + movdqa %xmm2, %xmm3; \ + psrlw $8, %xmm1; \ + psrlw $8, %xmm3; \ + psllw $8, %xmm0; \ + psllw $8, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm3, %xmm2; \ + movdqa %xmm0, i*16(%rsp); \ + movdqa %xmm2, (i+1)*16(%rsp); \ + .text .p2align 6 @@ -1171,14 +1127,14 @@ _sha256_transform_4way: .p2align 6 sha256_transform_4way_swap: - p2bswap_rsi_rsp 0 - p2bswap_rsi_rsp 2 - p2bswap_rsi_rsp 4 - p2bswap_rsi_rsp 6 - p2bswap_rsi_rsp 
8 - p2bswap_rsi_rsp 10 - p2bswap_rsi_rsp 12 - p2bswap_rsi_rsp 14 + p2bswap_rsi_rsp(0) + p2bswap_rsi_rsp(2) + p2bswap_rsi_rsp(4) + p2bswap_rsi_rsp(6) + p2bswap_rsi_rsp(8) + p2bswap_rsi_rsp(10) + p2bswap_rsi_rsp(12) + p2bswap_rsi_rsp(14) jmp *sha256_transform_4way_core_addr(%rip) .p2align 6 @@ -1232,30 +1188,30 @@ sha256_transform_8way_core_avx2: leaq 8*64(%rsp), %rax vmovdqa -2*32(%rax), %ymm3 vmovdqa -1*32(%rax), %ymm7 - sha256_avx2_extend_doubleround 0 - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 - sha256_avx2_extend_doubleround 6 - sha256_avx2_extend_doubleround 8 - sha256_avx2_extend_doubleround 10 - sha256_avx2_extend_doubleround 12 - sha256_avx2_extend_doubleround 14 - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 + sha256_avx2_extend_doubleround(0) + sha256_avx2_extend_doubleround(2) + sha256_avx2_extend_doubleround(4) + sha256_avx2_extend_doubleround(6) + sha256_avx2_extend_doubleround(8) + sha256_avx2_extend_doubleround(10) + sha256_avx2_extend_doubleround(12) + sha256_avx2_extend_doubleround(14) + sha256_avx2_extend_doubleround(16) + sha256_avx2_extend_doubleround(18) + sha256_avx2_extend_doubleround(20) + sha256_avx2_extend_doubleround(22) + sha256_avx2_extend_doubleround(24) + sha256_avx2_extend_doubleround(26) + sha256_avx2_extend_doubleround(28) + sha256_avx2_extend_doubleround(30) + sha256_avx2_extend_doubleround(32) + sha256_avx2_extend_doubleround(34) + sha256_avx2_extend_doubleround(36) + sha256_avx2_extend_doubleround(38) + sha256_avx2_extend_doubleround(40) + sha256_avx2_extend_doubleround(42) + sha256_avx2_extend_doubleround(44) + sha256_avx2_extend_doubleround(46) vmovdqu 0*32(%rdi), %ymm7 vmovdqu 1*32(%rdi), %ymm5 vmovdqu 2*32(%rdi), %ymm4 @@ -1266,40 +1222,40 @@ sha256_transform_8way_core_avx2: vmovdqu 7*32(%rdi), %ymm10 movq %rsp, %rax leaq sha256_8k(%rip), %rcx - sha256_avx2_main_quadround 0 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_quadround 56 - sha256_avx2_main_quadround 60 + sha256_avx2_main_quadround(0) + sha256_avx2_main_quadround(4) + sha256_avx2_main_quadround(8) + sha256_avx2_main_quadround(12) + sha256_avx2_main_quadround(16) + sha256_avx2_main_quadround(20) + sha256_avx2_main_quadround(24) + sha256_avx2_main_quadround(28) + sha256_avx2_main_quadround(32) + sha256_avx2_main_quadround(36) + sha256_avx2_main_quadround(40) + sha256_avx2_main_quadround(44) + sha256_avx2_main_quadround(48) + sha256_avx2_main_quadround(52) + sha256_avx2_main_quadround(56) + sha256_avx2_main_quadround(60) jmp sha256_transform_8way_finish -.macro p2bswap_avx2_rsi_rsp i - vmovdqu \i*32(%rsi), %ymm0 - vmovdqu (\i+1)*32(%rsi), %ymm2 - vpshuflw $0xb1, %ymm0, 
%ymm0 - vpshuflw $0xb1, %ymm2, %ymm2 - vpshufhw $0xb1, %ymm0, %ymm0 - vpshufhw $0xb1, %ymm2, %ymm2 - vpsrlw $8, %ymm0, %ymm1 - vpsrlw $8, %ymm2, %ymm3 - vpsllw $8, %ymm0, %ymm0 - vpsllw $8, %ymm2, %ymm2 - vpxor %ymm1, %ymm0, %ymm0 - vpxor %ymm3, %ymm2, %ymm2 - vmovdqa %ymm0, \i*32(%rsp) - vmovdqa %ymm2, (\i+1)*32(%rsp) -.endm +#define p2bswap_avx2_rsi_rsp(i) \ + vmovdqu i*32(%rsi), %ymm0; \ + vmovdqu (i+1)*32(%rsi), %ymm2; \ + vpshuflw $0xb1, %ymm0, %ymm0; \ + vpshuflw $0xb1, %ymm2, %ymm2; \ + vpshufhw $0xb1, %ymm0, %ymm0; \ + vpshufhw $0xb1, %ymm2, %ymm2; \ + vpsrlw $8, %ymm0, %ymm1; \ + vpsrlw $8, %ymm2, %ymm3; \ + vpsllw $8, %ymm0, %ymm0; \ + vpsllw $8, %ymm2, %ymm2; \ + vpxor %ymm1, %ymm0, %ymm0; \ + vpxor %ymm3, %ymm2, %ymm2; \ + vmovdqa %ymm0, i*32(%rsp); \ + vmovdqa %ymm2, (i+1)*32(%rsp); \ + .text .p2align 6 @@ -1364,14 +1320,14 @@ _sha256_transform_8way: .p2align 6 sha256_transform_8way_swap: - p2bswap_avx2_rsi_rsp 0 - p2bswap_avx2_rsi_rsp 2 - p2bswap_avx2_rsi_rsp 4 - p2bswap_avx2_rsi_rsp 6 - p2bswap_avx2_rsi_rsp 8 - p2bswap_avx2_rsi_rsp 10 - p2bswap_avx2_rsi_rsp 12 - p2bswap_avx2_rsi_rsp 14 + p2bswap_avx2_rsi_rsp(0) + p2bswap_avx2_rsi_rsp(2) + p2bswap_avx2_rsi_rsp(4) + p2bswap_avx2_rsi_rsp(6) + p2bswap_avx2_rsi_rsp(8) + p2bswap_avx2_rsi_rsp(10) + p2bswap_avx2_rsi_rsp(12) + p2bswap_avx2_rsi_rsp(14) jmp sha256_transform_8way_core_avx2 .p2align 6 @@ -1649,23 +1605,23 @@ sha256d_ms_4way_sse2_extend_loop1: movdqa %xmm7, 15*16(%rax) sha256d_ms_4way_sse2_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 + sha256_sse2_extend_doubleround(16) + sha256_sse2_extend_doubleround(18) + sha256_sse2_extend_doubleround(20) + sha256_sse2_extend_doubleround(22) + sha256_sse2_extend_doubleround(24) + sha256_sse2_extend_doubleround(26) + sha256_sse2_extend_doubleround(28) + sha256_sse2_extend_doubleround(30) + sha256_sse2_extend_doubleround(32) + sha256_sse2_extend_doubleround(34) + sha256_sse2_extend_doubleround(36) + sha256_sse2_extend_doubleround(38) + sha256_sse2_extend_doubleround(40) + sha256_sse2_extend_doubleround(42) jz sha256d_ms_4way_sse2_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 + sha256_sse2_extend_doubleround(44) + sha256_sse2_extend_doubleround(46) movdqa 0(%rcx), %xmm3 movdqa 16(%rcx), %xmm0 @@ -1684,30 +1640,30 @@ sha256d_ms_4way_sse2_extend_loop2: jmp sha256d_ms_4way_sse2_main_loop1 sha256d_ms_4way_sse2_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 + sha256_sse2_main_round(0) + sha256_sse2_main_round(1) + sha256_sse2_main_round(2) sha256d_ms_4way_sse2_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 - sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 + 
sha256_sse2_main_round(3) + sha256_sse2_main_quadround(4) + sha256_sse2_main_quadround(8) + sha256_sse2_main_quadround(12) + sha256_sse2_main_quadround(16) + sha256_sse2_main_quadround(20) + sha256_sse2_main_quadround(24) + sha256_sse2_main_quadround(28) + sha256_sse2_main_quadround(32) + sha256_sse2_main_quadround(36) + sha256_sse2_main_quadround(40) + sha256_sse2_main_quadround(44) + sha256_sse2_main_quadround(48) + sha256_sse2_main_quadround(52) + sha256_sse2_main_round(56) jz sha256d_ms_4way_sse2_finish - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 + sha256_sse2_main_round(57) + sha256_sse2_main_round(58) + sha256_sse2_main_round(59) + sha256_sse2_main_quadround(60) movdqa 5*16(%rsp), %xmm1 movdqa 6*16(%rsp), %xmm2 @@ -1796,8 +1752,8 @@ sha256d_ms_4way_sse2_main_loop1: movdqa %xmm3, 0*16(%rax) movdqa %xmm7, 1*16(%rax) - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 + sha256_sse2_extend_doubleround(2) + sha256_sse2_extend_doubleround(4) movdqa -9*16(%rax), %xmm0 movdqa sha256d_4preext2_23(%rip), %xmm4 @@ -1972,7 +1928,7 @@ sha256d_ms_4way_sse2_main_loop1: jmp sha256d_ms_4way_sse2_extend_loop2 sha256d_ms_4way_sse2_extend_coda2: - sha256_sse2_extend_round 44 + sha256_sse2_extend_round(44) movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -1990,42 +1946,42 @@ sha256d_ms_4way_sse2_extend_coda2: leaq sha256_4k(%rip), %rcx jmp sha256d_ms_4way_sse2_main_loop2 -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*\i(%rax), %xmm6 - paddd 16*\i(%rcx), %xmm6 - paddd 32(%rsp), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%rsp), %xmm2 - paddd \r7, %xmm6 - pandn %xmm2, %xmm1 - movdqa %xmm2, 32(%rsp) - movdqa 0(%rsp), %xmm2 - movdqa %xmm2, 16(%rsp) - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%rsp) - paddd %xmm1, %xmm6 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm6, %xmm0 -.endm +#define sha256_sse2_main_round_red(i, r7) \ + movdqa 16*i(%rax), %xmm6; \ + paddd 16*i(%rcx), %xmm6; \ + paddd 32(%rsp), %xmm6; \ + movdqa %xmm0, %xmm1; \ + movdqa 16(%rsp), %xmm2; \ + paddd r7, %xmm6; \ + pandn %xmm2, %xmm1; \ + movdqa %xmm2, 32(%rsp); \ + movdqa 0(%rsp), %xmm2; \ + movdqa %xmm2, 16(%rsp); \ + pand %xmm0, %xmm2; \ + pxor %xmm2, %xmm1; \ + movdqa %xmm0, 0(%rsp); \ + paddd %xmm1, %xmm6; \ + movdqa %xmm0, %xmm1; \ + psrld $6, %xmm0; \ + movdqa %xmm0, %xmm2; \ + pslld $7, %xmm1; \ + psrld $5, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $14, %xmm1; \ + psrld $14, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $5, %xmm1; \ + pxor %xmm1, %xmm0; \ + paddd %xmm6, %xmm0; \ + sha256d_ms_4way_sse2_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 + sha256_sse2_main_round_red(57, %xmm3) + sha256_sse2_main_round_red(58, %xmm4) + sha256_sse2_main_round_red(59, %xmm5) + sha256_sse2_main_round_red(60, %xmm7) paddd sha256_4h+112(%rip), %xmm0 movdqa %xmm0, 112(%rdi) @@ -2236,23 +2192,23 @@ sha256d_ms_4way_avx_extend_loop1: vmovdqa %xmm7, 15*16(%rax) sha256d_ms_4way_avx_extend_loop2: - sha256_avx_extend_doubleround 16 - sha256_avx_extend_doubleround 18 - sha256_avx_extend_doubleround 20 - sha256_avx_extend_doubleround 22 - 
sha256_avx_extend_doubleround 24 - sha256_avx_extend_doubleround 26 - sha256_avx_extend_doubleround 28 - sha256_avx_extend_doubleround 30 - sha256_avx_extend_doubleround 32 - sha256_avx_extend_doubleround 34 - sha256_avx_extend_doubleround 36 - sha256_avx_extend_doubleround 38 - sha256_avx_extend_doubleround 40 - sha256_avx_extend_doubleround 42 + sha256_avx_extend_doubleround(16) + sha256_avx_extend_doubleround(18) + sha256_avx_extend_doubleround(20) + sha256_avx_extend_doubleround(22) + sha256_avx_extend_doubleround(24) + sha256_avx_extend_doubleround(26) + sha256_avx_extend_doubleround(28) + sha256_avx_extend_doubleround(30) + sha256_avx_extend_doubleround(32) + sha256_avx_extend_doubleround(34) + sha256_avx_extend_doubleround(36) + sha256_avx_extend_doubleround(38) + sha256_avx_extend_doubleround(40) + sha256_avx_extend_doubleround(42) jz sha256d_ms_4way_avx_extend_coda2 - sha256_avx_extend_doubleround 44 - sha256_avx_extend_doubleround 46 + sha256_avx_extend_doubleround(44) + sha256_avx_extend_doubleround(46) movdqa 0(%rcx), %xmm7 movdqa 16(%rcx), %xmm8 @@ -2268,30 +2224,30 @@ sha256d_ms_4way_avx_extend_loop2: jmp sha256d_ms_4way_avx_main_loop1 sha256d_ms_4way_avx_main_loop2: - sha256_avx_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_avx_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_avx_main_round(0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7) + sha256_avx_main_round(1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3) + sha256_avx_main_round(2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4) sha256d_ms_4way_avx_main_loop1: - sha256_avx_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 4 - sha256_avx_main_quadround 8 - sha256_avx_main_quadround 12 - sha256_avx_main_quadround 16 - sha256_avx_main_quadround 20 - sha256_avx_main_quadround 24 - sha256_avx_main_quadround 28 - sha256_avx_main_quadround 32 - sha256_avx_main_quadround 36 - sha256_avx_main_quadround 40 - sha256_avx_main_quadround 44 - sha256_avx_main_quadround 48 - sha256_avx_main_quadround 52 - sha256_avx_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_avx_main_round(3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5) + sha256_avx_main_quadround(4) + sha256_avx_main_quadround(8) + sha256_avx_main_quadround(12) + sha256_avx_main_quadround(16) + sha256_avx_main_quadround(20) + sha256_avx_main_quadround(24) + sha256_avx_main_quadround(28) + sha256_avx_main_quadround(32) + sha256_avx_main_quadround(36) + sha256_avx_main_quadround(40) + sha256_avx_main_quadround(44) + sha256_avx_main_quadround(48) + sha256_avx_main_quadround(52) + sha256_avx_main_round(56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7) jz sha256d_ms_4way_avx_finish - sha256_avx_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_avx_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_avx_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_avx_main_quadround 60 + sha256_avx_main_round(57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3) + sha256_avx_main_round(58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4) + sha256_avx_main_round(59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5) + sha256_avx_main_quadround(60) movdqa 2*16(%rsp), %xmm1 movdqa 3*16(%rsp), %xmm2 @@ -2371,8 +2327,8 
@@ sha256d_ms_4way_avx_main_loop1: vmovdqa %xmm3, 0*16(%rax) vmovdqa %xmm7, 1*16(%rax) - sha256_avx_extend_doubleround 2 - sha256_avx_extend_doubleround 4 + sha256_avx_extend_doubleround(2) + sha256_avx_extend_doubleround(4) vmovdqa -9*16(%rax), %xmm0 vpslld $14, %xmm0, %xmm2 @@ -2521,7 +2477,7 @@ sha256d_ms_4way_avx_main_loop1: jmp sha256d_ms_4way_avx_extend_loop2 sha256d_ms_4way_avx_extend_coda2: - sha256_avx_extend_round 44 + sha256_avx_extend_round(44) movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -2536,33 +2492,33 @@ sha256d_ms_4way_avx_extend_coda2: leaq sha256_4k(%rip), %rcx jmp sha256d_ms_4way_avx_main_loop2 -.macro sha256_avx_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd %xmm1, %xmm6, %xmm6 - vpslld $7, \r3, %xmm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $14, %xmm1, %xmm1 - vpsrld $14, %xmm2, %xmm2 - vpxor %xmm1, \r0, \r0 - vpxor %xmm2, \r0, \r0 - vpslld $5, %xmm1, %xmm1 - vpxor %xmm1, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm +#define sha256_avx_main_round_red(i, r0, r1, r2, r3, r4) \ + vpaddd 16*i(%rax), r0, %xmm6; \ + vpaddd 16*i(%rcx), %xmm6, %xmm6; \ + vpandn r1, r3, %xmm1; \ + vpand r3, r2, %xmm2; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vpslld $7, r3, %xmm1; \ + vpsrld $6, r3, r0; \ + vpsrld $5, r0, %xmm2; \ + vpxor %xmm1, r0, r0; \ + vpxor %xmm2, r0, r0; \ + vpslld $14, %xmm1, %xmm1; \ + vpsrld $14, %xmm2, %xmm2; \ + vpxor %xmm1, r0, r0; \ + vpxor %xmm2, r0, r0; \ + vpslld $5, %xmm1, %xmm1; \ + vpxor %xmm1, r0, r0; \ + vpaddd r0, %xmm6, %xmm6; \ + vpaddd %xmm6, r4, r0; \ + sha256d_ms_4way_avx_finish: - sha256_avx_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_avx_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_avx_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_avx_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + sha256_avx_main_round_red(57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4) + sha256_avx_main_round_red(58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5) + sha256_avx_main_round_red(59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7) + sha256_avx_main_round_red(60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3) paddd sha256_4h+112(%rip), %xmm10 movdqa %xmm10, 112(%rdi) @@ -2726,23 +2682,23 @@ sha256d_ms_4way_xop_extend_loop1: vmovdqa %xmm7, 15*16(%rax) sha256d_ms_4way_xop_extend_loop2: - sha256_xop_extend_doubleround 16 - sha256_xop_extend_doubleround 18 - sha256_xop_extend_doubleround 20 - sha256_xop_extend_doubleround 22 - sha256_xop_extend_doubleround 24 - sha256_xop_extend_doubleround 26 - sha256_xop_extend_doubleround 28 - sha256_xop_extend_doubleround 30 - sha256_xop_extend_doubleround 32 - sha256_xop_extend_doubleround 34 - sha256_xop_extend_doubleround 36 - sha256_xop_extend_doubleround 38 - sha256_xop_extend_doubleround 40 - sha256_xop_extend_doubleround 42 + sha256_xop_extend_doubleround(16) + sha256_xop_extend_doubleround(18) + sha256_xop_extend_doubleround(20) + sha256_xop_extend_doubleround(22) + sha256_xop_extend_doubleround(24) + sha256_xop_extend_doubleround(26) + sha256_xop_extend_doubleround(28) + sha256_xop_extend_doubleround(30) + sha256_xop_extend_doubleround(32) + sha256_xop_extend_doubleround(34) + sha256_xop_extend_doubleround(36) + sha256_xop_extend_doubleround(38) + sha256_xop_extend_doubleround(40) + sha256_xop_extend_doubleround(42) jz 
sha256d_ms_4way_xop_extend_coda2 - sha256_xop_extend_doubleround 44 - sha256_xop_extend_doubleround 46 + sha256_xop_extend_doubleround(44) + sha256_xop_extend_doubleround(46) movdqa 0(%rcx), %xmm7 movdqa 16(%rcx), %xmm8 @@ -2758,30 +2714,30 @@ sha256d_ms_4way_xop_extend_loop2: jmp sha256d_ms_4way_xop_main_loop1 sha256d_ms_4way_xop_main_loop2: - sha256_xop_main_round 0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 - sha256_xop_main_round 1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 + sha256_xop_main_round(0, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7) + sha256_xop_main_round(1, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3) + sha256_xop_main_round(2, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4) sha256d_ms_4way_xop_main_loop1: - sha256_xop_main_round 3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 4 - sha256_xop_main_quadround 8 - sha256_xop_main_quadround 12 - sha256_xop_main_quadround 16 - sha256_xop_main_quadround 20 - sha256_xop_main_quadround 24 - sha256_xop_main_quadround 28 - sha256_xop_main_quadround 32 - sha256_xop_main_quadround 36 - sha256_xop_main_quadround 40 - sha256_xop_main_quadround 44 - sha256_xop_main_quadround 48 - sha256_xop_main_quadround 52 - sha256_xop_main_round 56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7 + sha256_xop_main_round(3, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5) + sha256_xop_main_quadround(4) + sha256_xop_main_quadround(8) + sha256_xop_main_quadround(12) + sha256_xop_main_quadround(16) + sha256_xop_main_quadround(20) + sha256_xop_main_quadround(24) + sha256_xop_main_quadround(28) + sha256_xop_main_quadround(32) + sha256_xop_main_quadround(36) + sha256_xop_main_quadround(40) + sha256_xop_main_quadround(44) + sha256_xop_main_quadround(48) + sha256_xop_main_quadround(52) + sha256_xop_main_round(56, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3, %xmm4, %xmm5, %xmm7) jz sha256d_ms_4way_xop_finish - sha256_xop_main_round 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3 - sha256_xop_main_round 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4 - sha256_xop_main_round 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5 - sha256_xop_main_quadround 60 + sha256_xop_main_round(57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4, %xmm5, %xmm7, %xmm3) + sha256_xop_main_round(58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5, %xmm7, %xmm3, %xmm4) + sha256_xop_main_round(59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7, %xmm3, %xmm4, %xmm5) + sha256_xop_main_quadround(60) movdqa 2*16(%rsp), %xmm1 movdqa 3*16(%rsp), %xmm2 @@ -2853,8 +2809,8 @@ sha256d_ms_4way_xop_main_loop1: vmovdqa %xmm3, 0*16(%rax) vmovdqa %xmm7, 1*16(%rax) - sha256_xop_extend_doubleround 2 - sha256_xop_extend_doubleround 4 + sha256_xop_extend_doubleround(2) + sha256_xop_extend_doubleround(4) vmovdqa -9*16(%rax), %xmm0 vprotd $25, %xmm0, %xmm1 @@ -2955,7 +2911,7 @@ sha256d_ms_4way_xop_main_loop1: jmp sha256d_ms_4way_xop_extend_loop2 sha256d_ms_4way_xop_extend_coda2: - sha256_xop_extend_round 44 + sha256_xop_extend_round(44) movdqa sha256_4h+0(%rip), %xmm7 movdqa sha256_4h+16(%rip), %xmm5 @@ -2970,27 +2926,27 @@ sha256d_ms_4way_xop_extend_coda2: leaq sha256_4k(%rip), %rcx jmp sha256d_ms_4way_xop_main_loop2 -.macro sha256_xop_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 16*\i(%rax), \r0, %xmm6 - vpaddd 16*\i(%rcx), %xmm6, %xmm6 - vpandn \r1, \r3, %xmm1 - vpand \r3, \r2, %xmm2 - vpxor %xmm2, %xmm1, %xmm1 - vpaddd 
%xmm1, %xmm6, %xmm6 - vprotd $26, \r3, %xmm1 - vprotd $21, \r3, %xmm2 - vpxor %xmm1, %xmm2, %xmm2 - vprotd $7, \r3, \r0 - vpxor %xmm2, \r0, \r0 - vpaddd \r0, %xmm6, %xmm6 - vpaddd %xmm6, \r4, \r0 -.endm +#define sha256_xop_main_round_red(i, r0, r1, r2, r3, r4) \ + vpaddd 16*i(%rax), r0, %xmm6; \ + vpaddd 16*i(%rcx), %xmm6, %xmm6; \ + vpandn r1, r3, %xmm1; \ + vpand r3, r2, %xmm2; \ + vpxor %xmm2, %xmm1, %xmm1; \ + vpaddd %xmm1, %xmm6, %xmm6; \ + vprotd $26, r3, %xmm1; \ + vprotd $21, r3, %xmm2; \ + vpxor %xmm1, %xmm2, %xmm2; \ + vprotd $7, r3, r0; \ + vpxor %xmm2, r0, r0; \ + vpaddd r0, %xmm6, %xmm6; \ + vpaddd %xmm6, r4, r0; \ + sha256d_ms_4way_xop_finish: - sha256_xop_main_round_red 57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4 - sha256_xop_main_round_red 58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5 - sha256_xop_main_round_red 59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7 - sha256_xop_main_round_red 60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3 + sha256_xop_main_round_red(57, %xmm9, %xmm8, %xmm0, %xmm10, %xmm4) + sha256_xop_main_round_red(58, %xmm8, %xmm0, %xmm10, %xmm9, %xmm5) + sha256_xop_main_round_red(59, %xmm0, %xmm10, %xmm9, %xmm8, %xmm7) + sha256_xop_main_round_red(60, %xmm10, %xmm9, %xmm8, %xmm0, %xmm3) paddd sha256_4h+112(%rip), %xmm10 movdqa %xmm10, 112(%rdi) @@ -3270,23 +3226,23 @@ sha256d_ms_8way_avx2_extend_loop1: vmovdqa %ymm7, 15*32(%rax) sha256d_ms_8way_avx2_extend_loop2: - sha256_avx2_extend_doubleround 16 - sha256_avx2_extend_doubleround 18 - sha256_avx2_extend_doubleround 20 - sha256_avx2_extend_doubleround 22 - sha256_avx2_extend_doubleround 24 - sha256_avx2_extend_doubleround 26 - sha256_avx2_extend_doubleround 28 - sha256_avx2_extend_doubleround 30 - sha256_avx2_extend_doubleround 32 - sha256_avx2_extend_doubleround 34 - sha256_avx2_extend_doubleround 36 - sha256_avx2_extend_doubleround 38 - sha256_avx2_extend_doubleround 40 - sha256_avx2_extend_doubleround 42 + sha256_avx2_extend_doubleround(16) + sha256_avx2_extend_doubleround(18) + sha256_avx2_extend_doubleround(20) + sha256_avx2_extend_doubleround(22) + sha256_avx2_extend_doubleround(24) + sha256_avx2_extend_doubleround(26) + sha256_avx2_extend_doubleround(28) + sha256_avx2_extend_doubleround(30) + sha256_avx2_extend_doubleround(32) + sha256_avx2_extend_doubleround(34) + sha256_avx2_extend_doubleround(36) + sha256_avx2_extend_doubleround(38) + sha256_avx2_extend_doubleround(40) + sha256_avx2_extend_doubleround(42) jz sha256d_ms_8way_avx2_extend_coda2 - sha256_avx2_extend_doubleround 44 - sha256_avx2_extend_doubleround 46 + sha256_avx2_extend_doubleround(44) + sha256_avx2_extend_doubleround(46) vmovdqa 0(%rcx), %ymm7 vmovdqa 32(%rcx), %ymm8 @@ -3302,30 +3258,30 @@ sha256d_ms_8way_avx2_extend_loop2: jmp sha256d_ms_8way_avx2_main_loop1 sha256d_ms_8way_avx2_main_loop2: - sha256_avx2_main_round 0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 - sha256_avx2_main_round 1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 + sha256_avx2_main_round(0, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7) + sha256_avx2_main_round(1, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3) + sha256_avx2_main_round(2, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4) sha256d_ms_8way_avx2_main_loop1: - sha256_avx2_main_round 3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 4 - sha256_avx2_main_quadround 8 - sha256_avx2_main_quadround 12 - sha256_avx2_main_quadround 16 - sha256_avx2_main_quadround 20 - 
sha256_avx2_main_quadround 24 - sha256_avx2_main_quadround 28 - sha256_avx2_main_quadround 32 - sha256_avx2_main_quadround 36 - sha256_avx2_main_quadround 40 - sha256_avx2_main_quadround 44 - sha256_avx2_main_quadround 48 - sha256_avx2_main_quadround 52 - sha256_avx2_main_round 56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7 + sha256_avx2_main_round(3, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5) + sha256_avx2_main_quadround(4) + sha256_avx2_main_quadround(8) + sha256_avx2_main_quadround(12) + sha256_avx2_main_quadround(16) + sha256_avx2_main_quadround(20) + sha256_avx2_main_quadround(24) + sha256_avx2_main_quadround(28) + sha256_avx2_main_quadround(32) + sha256_avx2_main_quadround(36) + sha256_avx2_main_quadround(40) + sha256_avx2_main_quadround(44) + sha256_avx2_main_quadround(48) + sha256_avx2_main_quadround(52) + sha256_avx2_main_round(56, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3, %ymm4, %ymm5, %ymm7) jz sha256d_ms_8way_avx2_finish - sha256_avx2_main_round 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3 - sha256_avx2_main_round 58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4 - sha256_avx2_main_round 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5 - sha256_avx2_main_quadround 60 + sha256_avx2_main_round(57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4, %ymm5, %ymm7, %ymm3) + sha256_avx2_main_round(58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5, %ymm7, %ymm3, %ymm4) + sha256_avx2_main_round(59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7, %ymm3, %ymm4, %ymm5) + sha256_avx2_main_quadround(60) vmovdqa 2*32(%rsp), %ymm1 vmovdqa 3*32(%rsp), %ymm2 @@ -3406,8 +3362,8 @@ sha256d_ms_8way_avx2_main_loop1: vmovdqa %ymm3, 0*32(%rax) vmovdqa %ymm7, 1*32(%rax) - sha256_avx2_extend_doubleround 2 - sha256_avx2_extend_doubleround 4 + sha256_avx2_extend_doubleround(2) + sha256_avx2_extend_doubleround(4) vmovdqa -9*32(%rax), %ymm0 vpslld $14, %ymm0, %ymm2 @@ -3556,7 +3512,7 @@ sha256d_ms_8way_avx2_main_loop1: jmp sha256d_ms_8way_avx2_extend_loop2 sha256d_ms_8way_avx2_extend_coda2: - sha256_avx2_extend_round 44 + sha256_avx2_extend_round(44) vmovdqa sha256_8h+0(%rip), %ymm7 vmovdqa sha256_8h+32(%rip), %ymm5 @@ -3571,33 +3527,33 @@ sha256d_ms_8way_avx2_extend_coda2: leaq sha256_8k(%rip), %rcx jmp sha256d_ms_8way_avx2_main_loop2 -.macro sha256_avx2_main_round_red i, r0, r1, r2, r3, r4 - vpaddd 32*\i(%rax), \r0, %ymm6 - vpaddd 32*\i(%rcx), %ymm6, %ymm6 - vpandn \r1, \r3, %ymm1 - vpand \r3, \r2, %ymm2 - vpxor %ymm2, %ymm1, %ymm1 - vpaddd %ymm1, %ymm6, %ymm6 - vpslld $7, \r3, %ymm1 - vpsrld $6, \r3, \r0 - vpsrld $5, \r0, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $14, %ymm1, %ymm1 - vpsrld $14, %ymm2, %ymm2 - vpxor %ymm1, \r0, \r0 - vpxor %ymm2, \r0, \r0 - vpslld $5, %ymm1, %ymm1 - vpxor %ymm1, \r0, \r0 - vpaddd \r0, %ymm6, %ymm6 - vpaddd %ymm6, \r4, \r0 -.endm +#define sha256_avx2_main_round_red(i, r0, r1, r2, r3, r4) \ + vpaddd 32*i(%rax), r0, %ymm6; \ + vpaddd 32*i(%rcx), %ymm6, %ymm6; \ + vpandn r1, r3, %ymm1; \ + vpand r3, r2, %ymm2; \ + vpxor %ymm2, %ymm1, %ymm1; \ + vpaddd %ymm1, %ymm6, %ymm6; \ + vpslld $7, r3, %ymm1; \ + vpsrld $6, r3, r0; \ + vpsrld $5, r0, %ymm2; \ + vpxor %ymm1, r0, r0; \ + vpxor %ymm2, r0, r0; \ + vpslld $14, %ymm1, %ymm1; \ + vpsrld $14, %ymm2, %ymm2; \ + vpxor %ymm1, r0, r0; \ + vpxor %ymm2, r0, r0; \ + vpslld $5, %ymm1, %ymm1; \ + vpxor %ymm1, r0, r0; \ + vpaddd r0, %ymm6, %ymm6; \ + vpaddd %ymm6, r4, r0; \ + sha256d_ms_8way_avx2_finish: - sha256_avx2_main_round_red 57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4 - sha256_avx2_main_round_red 
58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5 - sha256_avx2_main_round_red 59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7 - sha256_avx2_main_round_red 60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3 + sha256_avx2_main_round_red(57, %ymm9, %ymm8, %ymm0, %ymm10, %ymm4) + sha256_avx2_main_round_red(58, %ymm8, %ymm0, %ymm10, %ymm9, %ymm5) + sha256_avx2_main_round_red(59, %ymm0, %ymm10, %ymm9, %ymm8, %ymm7) + sha256_avx2_main_round_red(60, %ymm10, %ymm9, %ymm8, %ymm0, %ymm3) vpaddd sha256_8h+224(%rip), %ymm10, %ymm10 vmovdqa %ymm10, 224(%rdi) diff --git a/sha2-x86.S b/sha2-x86.S index e2eb112a9..4b84ddb57 100644 --- a/sha2-x86.S +++ b/sha2-x86.S @@ -135,189 +135,176 @@ _sha256_init_4way: ret -.macro sha256_sse2_extend_round i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa %xmm0, %xmm2 - psrld $3, %xmm0 - movdqa %xmm0, %xmm1 - pslld $14, %xmm2 - psrld $4, %xmm1 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - psrld $11, %xmm1 - pslld $11, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-7)*16(%eax), %xmm0 +#define sha256_sse2_extend_round(i) \ + movdqa (i-15)*16(%eax), %xmm0; \ + movdqa %xmm0, %xmm2; \ + psrld $3, %xmm0; \ + movdqa %xmm0, %xmm1; \ + pslld $14, %xmm2; \ + psrld $4, %xmm1; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + psrld $11, %xmm1; \ + pslld $11, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + paddd (i-16)*16(%eax), %xmm0; \ + paddd (i-7)*16(%eax), %xmm0; \ + movdqa %xmm3, %xmm2; \ + psrld $10, %xmm3; \ + pslld $13, %xmm2; \ + movdqa %xmm3, %xmm1; \ + psrld $7, %xmm1; \ + pxor %xmm1, %xmm3; \ + pxor %xmm2, %xmm3; \ + psrld $2, %xmm1; \ + pslld $2, %xmm2; \ + pxor %xmm1, %xmm3; \ + pxor %xmm2, %xmm3; \ + paddd %xmm0, %xmm3; \ + movdqa %xmm3, i*16(%eax); \ - movdqa %xmm3, %xmm2 - psrld $10, %xmm3 - pslld $13, %xmm2 - movdqa %xmm3, %xmm1 - psrld $7, %xmm1 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - psrld $2, %xmm1 - pslld $2, %xmm2 - pxor %xmm1, %xmm3 - pxor %xmm2, %xmm3 - paddd %xmm0, %xmm3 - movdqa %xmm3, \i*16(%eax) -.endm -.macro sha256_sse2_extend_doubleround i - movdqa (\i-15)*16(%eax), %xmm0 - movdqa (\i-14)*16(%eax), %xmm4 - movdqa %xmm0, %xmm2 - movdqa %xmm4, %xmm6 - psrld $3, %xmm0 - psrld $3, %xmm4 - movdqa %xmm0, %xmm1 - movdqa %xmm4, %xmm5 - pslld $14, %xmm2 - pslld $14, %xmm6 - psrld $4, %xmm1 - psrld $4, %xmm5 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - psrld $11, %xmm1 - psrld $11, %xmm5 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 - pslld $11, %xmm2 - pslld $11, %xmm6 - pxor %xmm1, %xmm0 - pxor %xmm5, %xmm4 - pxor %xmm2, %xmm0 - pxor %xmm6, %xmm4 +#define sha256_sse2_extend_doubleround(i) \ + movdqa (i-15)*16(%eax), %xmm0; \ + movdqa (i-14)*16(%eax), %xmm4; \ + movdqa %xmm0, %xmm2; \ + movdqa %xmm4, %xmm6; \ + psrld $3, %xmm0; \ + psrld $3, %xmm4; \ + movdqa %xmm0, %xmm1; \ + movdqa %xmm4, %xmm5; \ + pslld $14, %xmm2; \ + pslld $14, %xmm6; \ + psrld $4, %xmm1; \ + psrld $4, %xmm5; \ + pxor %xmm1, %xmm0; \ + pxor %xmm5, %xmm4; \ + psrld $11, %xmm1; \ + psrld $11, %xmm5; \ + pxor %xmm2, %xmm0; \ + pxor %xmm6, %xmm4; \ + pslld $11, %xmm2; \ + pslld $11, %xmm6; \ + pxor %xmm1, %xmm0; \ + pxor %xmm5, %xmm4; \ + pxor %xmm2, %xmm0; \ + pxor %xmm6, %xmm4; \ + paddd (i-16)*16(%eax), %xmm0; \ + paddd (i-15)*16(%eax), %xmm4; \ + movdqa %xmm3, %xmm2; \ + movdqa %xmm7, %xmm6; \ + psrld $10, %xmm3; \ + psrld $10, %xmm7; \ + movdqa %xmm3, %xmm1; \ + movdqa %xmm7, %xmm5; \ + pslld $13, %xmm2; \ + pslld $13, %xmm6; \ + psrld $7, %xmm1; \ + psrld $7, %xmm5; \ + paddd (i-7)*16(%eax), %xmm0; \ + paddd (i-6)*16(%eax), %xmm4; \ + pxor %xmm1, %xmm3; \ + pxor %xmm5, %xmm7; \ + 
psrld $2, %xmm1; \ + psrld $2, %xmm5; \ + pxor %xmm2, %xmm3; \ + pxor %xmm6, %xmm7; \ + pslld $2, %xmm2; \ + pslld $2, %xmm6; \ + pxor %xmm1, %xmm3; \ + pxor %xmm5, %xmm7; \ + pxor %xmm2, %xmm3; \ + pxor %xmm6, %xmm7; \ + paddd %xmm0, %xmm3; \ + paddd %xmm4, %xmm7; \ + movdqa %xmm3, i*16(%eax); \ + movdqa %xmm7, (i+1)*16(%eax); \ - paddd (\i-16)*16(%eax), %xmm0 - paddd (\i-15)*16(%eax), %xmm4 - movdqa %xmm3, %xmm2 - movdqa %xmm7, %xmm6 - psrld $10, %xmm3 - psrld $10, %xmm7 - movdqa %xmm3, %xmm1 - movdqa %xmm7, %xmm5 - pslld $13, %xmm2 - pslld $13, %xmm6 - psrld $7, %xmm1 - psrld $7, %xmm5 +#define sha256_sse2_main_round(i) \ + movdqa 16*(i)(%eax), %xmm6; \ + movdqa %xmm0, %xmm1; \ + movdqa 16(%esp), %xmm2; \ + pandn %xmm2, %xmm1; \ + paddd 32(%esp), %xmm6; \ + movdqa %xmm2, 32(%esp); \ + movdqa 0(%esp), %xmm2; \ + movdqa %xmm2, 16(%esp); \ + pand %xmm0, %xmm2; \ + pxor %xmm2, %xmm1; \ + movdqa %xmm0, 0(%esp); \ + paddd %xmm1, %xmm6; \ + movdqa %xmm0, %xmm1; \ + psrld $6, %xmm0; \ + paddd 16*(i)+sha256_4k, %xmm6; \ + movdqa %xmm0, %xmm2; \ + pslld $7, %xmm1; \ + psrld $5, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $14, %xmm1; \ + psrld $14, %xmm2; \ + pxor %xmm1, %xmm0; \ + pslld $5, %xmm1; \ + pxor %xmm2, %xmm0; \ + pxor %xmm1, %xmm0; \ + movdqa %xmm5, %xmm1; \ + paddd %xmm0, %xmm6; \ + movdqa %xmm3, %xmm0; \ + movdqa %xmm4, %xmm3; \ + movdqa %xmm4, %xmm2; \ + paddd %xmm6, %xmm0; \ + pand %xmm5, %xmm2; \ + pand %xmm7, %xmm1; \ + pand %xmm7, %xmm4; \ + pxor %xmm4, %xmm1; \ + movdqa %xmm5, %xmm4; \ + movdqa %xmm7, %xmm5; \ + pxor %xmm2, %xmm1; \ + paddd %xmm1, %xmm6; \ + movdqa %xmm7, %xmm2; \ + psrld $2, %xmm7; \ + movdqa %xmm7, %xmm1; \ + pslld $10, %xmm2; \ + psrld $11, %xmm1; \ + pxor %xmm2, %xmm7; \ + pslld $9, %xmm2; \ + pxor %xmm1, %xmm7; \ + psrld $9, %xmm1; \ + pxor %xmm2, %xmm7; \ + pslld $11, %xmm2; \ + pxor %xmm1, %xmm7; \ + pxor %xmm2, %xmm7; \ + paddd %xmm6, %xmm7; \ - paddd (\i-7)*16(%eax), %xmm0 - paddd (\i-6)*16(%eax), %xmm4 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - psrld $2, %xmm1 - psrld $2, %xmm5 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 - pslld $2, %xmm2 - pslld $2, %xmm6 - pxor %xmm1, %xmm3 - pxor %xmm5, %xmm7 - pxor %xmm2, %xmm3 - pxor %xmm6, %xmm7 +#define sha256_sse2_main_quadround(i) \ + sha256_sse2_main_round(i+0); \ + sha256_sse2_main_round(i+1); \ + sha256_sse2_main_round(i+2); \ + sha256_sse2_main_round(i+3); \ - paddd %xmm0, %xmm3 - paddd %xmm4, %xmm7 - movdqa %xmm3, \i*16(%eax) - movdqa %xmm7, (\i+1)*16(%eax) -.endm -.macro sha256_sse2_main_round i - movdqa 16*(\i)(%eax), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - pandn %xmm2, %xmm1 - paddd 32(%esp), %xmm6 +#define p2bswap_esi_esp(i) \ + movdqu i*16(%esi), %xmm0; \ + movdqu (i+1)*16(%esi), %xmm2; \ + pshuflw $0xb1, %xmm0, %xmm0; \ + pshuflw $0xb1, %xmm2, %xmm2; \ + pshufhw $0xb1, %xmm0, %xmm0; \ + pshufhw $0xb1, %xmm2, %xmm2; \ + movdqa %xmm0, %xmm1; \ + movdqa %xmm2, %xmm3; \ + psrlw $8, %xmm1; \ + psrlw $8, %xmm3; \ + psllw $8, %xmm0; \ + psllw $8, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm3, %xmm2; \ + movdqa %xmm0, (i+3)*16(%esp); \ + movdqa %xmm2, (i+4)*16(%esp); \ - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - - paddd %xmm1, %xmm6 - - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - paddd 16*(\i)+sha256_4k, %xmm6 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pslld $5, %xmm1 - 
pxor %xmm2, %xmm0 - pxor %xmm1, %xmm0 - movdqa %xmm5, %xmm1 - paddd %xmm0, %xmm6 - - movdqa %xmm3, %xmm0 - movdqa %xmm4, %xmm3 - movdqa %xmm4, %xmm2 - paddd %xmm6, %xmm0 - pand %xmm5, %xmm2 - pand %xmm7, %xmm1 - pand %xmm7, %xmm4 - pxor %xmm4, %xmm1 - movdqa %xmm5, %xmm4 - movdqa %xmm7, %xmm5 - pxor %xmm2, %xmm1 - paddd %xmm1, %xmm6 - - movdqa %xmm7, %xmm2 - psrld $2, %xmm7 - movdqa %xmm7, %xmm1 - pslld $10, %xmm2 - psrld $11, %xmm1 - pxor %xmm2, %xmm7 - pslld $9, %xmm2 - pxor %xmm1, %xmm7 - psrld $9, %xmm1 - pxor %xmm2, %xmm7 - pslld $11, %xmm2 - pxor %xmm1, %xmm7 - pxor %xmm2, %xmm7 - paddd %xmm6, %xmm7 -.endm - -.macro sha256_sse2_main_quadround i - sha256_sse2_main_round \i+0 - sha256_sse2_main_round \i+1 - sha256_sse2_main_round \i+2 - sha256_sse2_main_round \i+3 -.endm - - -.macro p2bswap_esi_esp i - movdqu \i*16(%esi), %xmm0 - movdqu (\i+1)*16(%esi), %xmm2 - pshuflw $0xb1, %xmm0, %xmm0 - pshuflw $0xb1, %xmm2, %xmm2 - pshufhw $0xb1, %xmm0, %xmm0 - pshufhw $0xb1, %xmm2, %xmm2 - movdqa %xmm0, %xmm1 - movdqa %xmm2, %xmm3 - psrlw $8, %xmm1 - psrlw $8, %xmm3 - psllw $8, %xmm0 - psllw $8, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm3, %xmm2 - movdqa %xmm0, (\i+3)*16(%esp) - movdqa %xmm2, (\i+4)*16(%esp) -.endm .text .p2align 5 @@ -373,14 +360,14 @@ _sha256_transform_4way: .p2align 5 sha256_transform_4way_swap: - p2bswap_esi_esp 0 - p2bswap_esi_esp 2 - p2bswap_esi_esp 4 - p2bswap_esi_esp 6 - p2bswap_esi_esp 8 - p2bswap_esi_esp 10 - p2bswap_esi_esp 12 - p2bswap_esi_esp 14 + p2bswap_esi_esp(0) + p2bswap_esi_esp(2) + p2bswap_esi_esp(4) + p2bswap_esi_esp(6) + p2bswap_esi_esp(8) + p2bswap_esi_esp(10) + p2bswap_esi_esp(12) + p2bswap_esi_esp(14) sha256_transform_4way_extend: leal 19*16(%esp), %ecx @@ -784,23 +771,23 @@ sha256d_ms_4way_extend_loop1: movdqa %xmm7, 15*16(%eax) sha256d_ms_4way_extend_loop2: - sha256_sse2_extend_doubleround 16 - sha256_sse2_extend_doubleround 18 - sha256_sse2_extend_doubleround 20 - sha256_sse2_extend_doubleround 22 - sha256_sse2_extend_doubleround 24 - sha256_sse2_extend_doubleround 26 - sha256_sse2_extend_doubleround 28 - sha256_sse2_extend_doubleround 30 - sha256_sse2_extend_doubleround 32 - sha256_sse2_extend_doubleround 34 - sha256_sse2_extend_doubleround 36 - sha256_sse2_extend_doubleround 38 - sha256_sse2_extend_doubleround 40 - sha256_sse2_extend_doubleround 42 + sha256_sse2_extend_doubleround(16) + sha256_sse2_extend_doubleround(18) + sha256_sse2_extend_doubleround(20) + sha256_sse2_extend_doubleround(22) + sha256_sse2_extend_doubleround(24) + sha256_sse2_extend_doubleround(26) + sha256_sse2_extend_doubleround(28) + sha256_sse2_extend_doubleround(30) + sha256_sse2_extend_doubleround(32) + sha256_sse2_extend_doubleround(34) + sha256_sse2_extend_doubleround(36) + sha256_sse2_extend_doubleround(38) + sha256_sse2_extend_doubleround(40) + sha256_sse2_extend_doubleround(42) jz sha256d_ms_4way_extend_coda2 - sha256_sse2_extend_doubleround 44 - sha256_sse2_extend_doubleround 46 + sha256_sse2_extend_doubleround(44) + sha256_sse2_extend_doubleround(46) movdqa 0(%ecx), %xmm3 movdqa 16(%ecx), %xmm0 @@ -818,30 +805,30 @@ sha256d_ms_4way_extend_loop2: jmp sha256d_ms_4way_main_loop1 sha256d_ms_4way_main_loop2: - sha256_sse2_main_round 0 - sha256_sse2_main_round 1 - sha256_sse2_main_round 2 + sha256_sse2_main_round(0) + sha256_sse2_main_round(1) + sha256_sse2_main_round(2) sha256d_ms_4way_main_loop1: - sha256_sse2_main_round 3 - sha256_sse2_main_quadround 4 - sha256_sse2_main_quadround 8 - sha256_sse2_main_quadround 12 - sha256_sse2_main_quadround 16 - sha256_sse2_main_quadround 20 
- sha256_sse2_main_quadround 24 - sha256_sse2_main_quadround 28 - sha256_sse2_main_quadround 32 - sha256_sse2_main_quadround 36 - sha256_sse2_main_quadround 40 - sha256_sse2_main_quadround 44 - sha256_sse2_main_quadround 48 - sha256_sse2_main_quadround 52 - sha256_sse2_main_round 56 + sha256_sse2_main_round(3) + sha256_sse2_main_quadround(4) + sha256_sse2_main_quadround(8) + sha256_sse2_main_quadround(12) + sha256_sse2_main_quadround(16) + sha256_sse2_main_quadround(20) + sha256_sse2_main_quadround(24) + sha256_sse2_main_quadround(28) + sha256_sse2_main_quadround(32) + sha256_sse2_main_quadround(36) + sha256_sse2_main_quadround(40) + sha256_sse2_main_quadround(44) + sha256_sse2_main_quadround(48) + sha256_sse2_main_quadround(52) + sha256_sse2_main_round(56) jz sha256d_ms_4way_finish - sha256_sse2_main_round 57 - sha256_sse2_main_round 58 - sha256_sse2_main_round 59 - sha256_sse2_main_quadround 60 + sha256_sse2_main_round(57) + sha256_sse2_main_round(58) + sha256_sse2_main_round(59) + sha256_sse2_main_quadround(60) movdqa 5*16(%esp), %xmm1 movdqa 6*16(%esp), %xmm2 @@ -928,8 +915,8 @@ sha256d_ms_4way_main_loop1: movdqa %xmm3, 0*16(%eax) movdqa %xmm7, 1*16(%eax) - sha256_sse2_extend_doubleround 2 - sha256_sse2_extend_doubleround 4 + sha256_sse2_extend_doubleround(2) + sha256_sse2_extend_doubleround(4) movdqa -9*16(%eax), %xmm0 movdqa sha256d_4preext2_23, %xmm4 @@ -1104,7 +1091,7 @@ sha256d_ms_4way_main_loop1: jmp sha256d_ms_4way_extend_loop2 sha256d_ms_4way_extend_coda2: - sha256_sse2_extend_round 44 + sha256_sse2_extend_round(44) movdqa sha256_4h+0, %xmm7 movdqa sha256_4h+16, %xmm5 @@ -1121,42 +1108,42 @@ sha256d_ms_4way_extend_coda2: leal 48(%esp), %eax jmp sha256d_ms_4way_main_loop2 -.macro sha256_sse2_main_round_red i, r7 - movdqa 16*(\i)(%eax), %xmm6 - paddd 16*(\i)+sha256_4k, %xmm6 - paddd 32(%esp), %xmm6 - movdqa %xmm0, %xmm1 - movdqa 16(%esp), %xmm2 - paddd \r7, %xmm6 - pandn %xmm2, %xmm1 - movdqa %xmm2, 32(%esp) - movdqa 0(%esp), %xmm2 - movdqa %xmm2, 16(%esp) - pand %xmm0, %xmm2 - pxor %xmm2, %xmm1 - movdqa %xmm0, 0(%esp) - paddd %xmm1, %xmm6 - movdqa %xmm0, %xmm1 - psrld $6, %xmm0 - movdqa %xmm0, %xmm2 - pslld $7, %xmm1 - psrld $5, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $14, %xmm1 - psrld $14, %xmm2 - pxor %xmm1, %xmm0 - pxor %xmm2, %xmm0 - pslld $5, %xmm1 - pxor %xmm1, %xmm0 - paddd %xmm6, %xmm0 -.endm +#define sha256_sse2_main_round_red(i, r7) \ + movdqa 16*(i)(%eax), %xmm6; \ + paddd 16*(i)+sha256_4k, %xmm6; \ + paddd 32(%esp), %xmm6; \ + movdqa %xmm0, %xmm1; \ + movdqa 16(%esp), %xmm2; \ + paddd r7, %xmm6; \ + pandn %xmm2, %xmm1; \ + movdqa %xmm2, 32(%esp); \ + movdqa 0(%esp), %xmm2; \ + movdqa %xmm2, 16(%esp); \ + pand %xmm0, %xmm2; \ + pxor %xmm2, %xmm1; \ + movdqa %xmm0, 0(%esp); \ + paddd %xmm1, %xmm6; \ + movdqa %xmm0, %xmm1; \ + psrld $6, %xmm0; \ + movdqa %xmm0, %xmm2; \ + pslld $7, %xmm1; \ + psrld $5, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $14, %xmm1; \ + psrld $14, %xmm2; \ + pxor %xmm1, %xmm0; \ + pxor %xmm2, %xmm0; \ + pslld $5, %xmm1; \ + pxor %xmm1, %xmm0; \ + paddd %xmm6, %xmm0; \ + sha256d_ms_4way_finish: - sha256_sse2_main_round_red 57, %xmm3 - sha256_sse2_main_round_red 58, %xmm4 - sha256_sse2_main_round_red 59, %xmm5 - sha256_sse2_main_round_red 60, %xmm7 + sha256_sse2_main_round_red(57, %xmm3) + sha256_sse2_main_round_red(58, %xmm4) + sha256_sse2_main_round_red(59, %xmm5) + sha256_sse2_main_round_red(60, %xmm7) paddd sha256_4h+112, %xmm0 movdqa %xmm0, 112(%edi)
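The hunks above apply one mechanical transformation: GNU as .macro/.endm definitions are rewritten as C-preprocessor #define macros, with each assembly statement terminated by ";" (the gas statement separator) and continued with a trailing backslash, \i parameter references becoming plain i, and every call site gaining parentheses (for example, sha256_sse2_main_quadround 12 becomes sha256_sse2_main_quadround(12)). Since capital-.S sources are fed through the C preprocessor by the compiler driver, the macros no longer depend on the assembler's own macro support, which is usually the point of such a conversion for assemblers that implement gas macro directives poorly (the patch itself does not state its motivation). A minimal sketch of the same conversion follows, using a hypothetical load_and_add macro that is not part of this patch; it mirrors the 32-bit code above and should assemble with something like gcc -m32 -c demo.S on an x86 target.

	/* Before: gas macro syntax, handled by the assembler itself. */
	.macro load_and_add i
	movdqa	(\i)*16(%eax), %xmm0
	paddd	((\i)+1)*16(%eax), %xmm0
	.endm

	/* After: cpp macro. Every line ends with ";" plus a continuation "\";
	   a blank line terminates the definition, matching the idiom used in
	   the converted sha2-x86.S macros above. */
	#define load_and_add(i) \
		movdqa	(i)*16(%eax), %xmm0; \
		paddd	((i)+1)*16(%eax), %xmm0; \

	/* Call sites switch from "load_and_add 3" to the parenthesized form. */
	.text
	demo:
		load_and_add(3)
		ret

Because the #define body expands onto a single logical line, assembler diagnostics for a bad argument point at the invocation rather than at a line inside the definition; keeping the argument arithmetic parenthesized, as the patch does with forms like (i-15)*16(%eax), avoids precedence surprises when an expression rather than a literal index is passed.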